diff --git a/CMakeLists.txt b/CMakeLists.txt
index e333cab7b37c5466067f1cd5f05c9385e5fa120c..a496109044e05590d8458a73f2c2a5e7c2d78074 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ dune_register_package_flags(LIBRARIES dunecodegen)
 
 dune_enable_all_packages()
 
+add_subdirectory(doc)
 add_subdirectory(dune/codegen)
 add_subdirectory(cmake/modules)
 
diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py
index d383963318291ae947c70b1f083c320a60250303..22835c7cbce61336168396007e73703394eb7433 100755
--- a/bin/donkey_benchmark_execution_wrapper.py
+++ b/bin/donkey_benchmark_execution_wrapper.py
@@ -5,8 +5,16 @@ import sys
 import subprocess
 import os
 
+os.environ['DUNE_CODEGEN_THREADS'] = '20'
+
 # Run the actual command
-command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+
+# The actual measurements will be called like this (mpi parallelism)
+# command = "srun -p haswell10c -n 20 -c 2 --cpu_bind=verbose,core".split()
+
+# Command for autotune benchmarks with threading
+command = "srun -p haswell10c -n 1 -c 20 --hint=nomultithread --cpu_bind=verbose,core".split()
+
 command.extend(sys.argv[1:])
 ret = subprocess.call(command)
 
diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake
index 61713109b32cc1af4716b0f9f133879592fec49c..da3225866785c75a8cf73e6aa78b6e3e0eea42f9 100644
--- a/cmake/modules/DuneCodegenMacros.cmake
+++ b/cmake/modules/DuneCodegenMacros.cmake
@@ -1,6 +1,6 @@
 # File for module specific CMake tests.
 #
-# .. cmake_function:: add_generated_executable
+# .. cmake_function:: dune_add_generated_executable
 #
 #    .. cmake_param:: UFLFILE
 #       :single:
@@ -22,19 +22,20 @@
 #       The name given to the added executable target.
 #
 #    .. cmake_param:: SOURCE
+#       :single:
 #
 #       The cc source file to build from. If omitted, a minimal
 #       source file and a driver file will be generated.
 #
 #    .. cmake_param:: FORM_COMPILER_ARGS
 #       :multi:
-#       :argname arg:
+#       :argname: arg
 #
 #       Additional arguments as recognized by the form compiler.
 #
 #    .. cmake_param:: DEPENDS
 #       :multi:
-#       :argname dep:
+#       :argname: dep
 #
 #       Additional dependencies of the generated executable (changes in those
 #       will retrigger generation)
@@ -57,7 +58,7 @@
 #
 #    .. cmake_param:: ANALYZE_GRID_COMMAND
 #       :multi:
-#       :argname command:
+#       :argname: command
 #
 #       Use this to pass a custom grid analysis command. This is necessary
 #       if you use a custom grid generation methdod. The inifile and the
@@ -104,7 +105,7 @@ else()
 endif()
 file(GLOB_RECURSE UFL2PDELAB_SOURCES ${UFL2PDELAB_GLOB_PATTERN})
 
-function(add_generated_executable)
+function(dune_add_generated_executable)
   set(OPTIONS EXCLUDE_FROM_ALL ANALYZE_GRID)
   set(SINGLE TARGET SOURCE UFLFILE INIFILE)
   set(MULTI FORM_COMPILER_ARGS DEPENDS ANALYZE_GRID_COMMAND)
@@ -112,15 +113,15 @@ function(add_generated_executable)
   cmake_parse_arguments(GEN "${OPTIONS}" "${SINGLE}" "${MULTI}" ${ARGN})
 
   if(GEN_UNPARSED_ARGUMENTS)
-    message(FATAL_ERROR "Unrecognized arguments in add_generated_executable. This usually indicates a typo.")
+    message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. This usually indicates a typo.")
   endif()
 
   # Apply defaults and enforce requirements
   if(NOT GEN_TARGET)
-    message(FATAL_ERROR "Need to specify the TARGET parameter for add_generated_executable")
+    message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable")
   endif()
   if(NOT GEN_UFLFILE)
-    message(FATAL_ERROR "Need to specify the UFLFILE parameter for add_generated_executable")
+    message(FATAL_ERROR "Need to specify the UFLFILE parameter for dune_add_generated_executable")
   endif()
   if(NOT IS_ABSOLUTE ${GEN_UFLFILE})
     set(GEN_UFLFILE ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_UFLFILE})
diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake
index 206ce3bcbf10a457182ffbb1a5f8e415ea3fcb24..87af7161c8b9c8196dbddc3ae42e55bcffd3bd81 100644
--- a/cmake/modules/GeneratedSystemtests.cmake
+++ b/cmake/modules/GeneratedSystemtests.cmake
@@ -1,11 +1,85 @@
-# System testing for generated executables. All ideas taken from dune-testtools.
+# System testing for generated executables. All ideas taken from dune-testtools.
+#
+# .. cmake_function:: dune_add_formcompiler_system_test
+#
+#    .. cmake_param:: UFLFILE
+#       :single:
+#       :required:
+#
+#       The UFL file to create the generate code from.
+#
+#    .. cmake_param:: INIFILE
+#       :single:
+#       :required:
+#
+#       The ini file that controls the form compilation process.
+#       It is expected to contain a [formcompiler] section. This
+#       file may contain meta ini annotations.
+#
+#    .. cmake_param:: BASENAME
+#       :single:
+#       :required:
+#
+#       The basename for the generated executables.
+#
+#    .. cmake_param:: SCRIPT
+#       :single:
+#
+#       The python script that decides about test failure/success.
+#       Defaults to a script that simply runs the program and checks
+#       the exit code. More scripts to be found in dune-testtools.
+#
+#    .. cmake_param:: SOURCE
+#       :single:
+#
+#       The cc source file to build from. If omitted, a minimal
+#       source file and a driver file will be generated.
+#
+#    .. cmake_param:: CREATED_TARGETS
+#       :single:
+#
+#       A variable name that should be filled with the list of generated
+#       targets. This can be used to modify these later on.
+#
+#    .. cmake_param:: DEPENDS
+#       :multi:
+#       :argname: dep
+#
+#       Additional dependencies of the generated executable (changes in those
+#       will retrigger generation)
+#
+#    .. cmake_param:: NO_TESTS
+#       :option:
+#
+#       If given, code will be generated and built normally, but no tests will
+#       be added to the test suite.
+#
+#    .. cmake_param:: ANALYZE_GRID
+#       :option:
+#
+#       Set this option to enable code generation time grid analysis.
+#       This is useful to reduce the variety of sum factorization kernels
+#       in unstructured grids. Note that the grid analysis tool needs to
+#       be able to construct your grid from the given inifile. If you have
+#       a custom grid construction method, you can use ANALYZE_GRID_COMMAND
+#       instead.
+#
+#    .. cmake_param:: ANALYZE_GRID_COMMAND
+#       :multi:
+#       :argname: command
+#
+#       Use this to pass a custom grid analysis command. This is necessary
+#       if you use a custom grid generation method. The inifile and the
+#       outputfile will be appended to this command. You can use the analysis code in
+#       dune/codegen/sumfact/analyzegrid.hh to write your own tool.
+#       Specifying this option will automatically set ANALYZE_GRID.
 #
 
 function(dune_add_formcompiler_system_test)
   # parse arguments
   set(OPTION DEBUG NO_TESTS ANALYZE_GRID)
-  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE)
-  set(MULTI CREATED_TARGETS DEPENDS)
+  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE CREATED_TARGETS)
+  set(MULTI DEPENDS ANALYZE_GRID_COMMAND)
   cmake_parse_arguments(SYSTEMTEST "${OPTION}" "${SINGLE}" "${MULTI}" ${ARGN})
 
   if(SYSTEMTEST_UNPARSED_ARGUMENTS)
@@ -25,6 +99,10 @@ function(dune_add_formcompiler_system_test)
   if(SYSTEMTEST_ANALYZE_GRID)
     set(ANALYZE_GRID_STR "ANALYZE_GRID")
   endif()
+  set(ANALYZE_GRID_COMMAND_STR "")
+  if(SYSTEMTEST_ANALYZE_GRID_COMMAND)
+    set(ANALYZE_GRID_COMMAND_STR ANALYZE_GRID_COMMAND ${SYSTEMTEST_ANALYZE_GRID_COMMAND})
+  endif()
 
   # set a default for the script. call_executable.py just calls the executable.
   # There, it is also possible to hook in things depending on the inifile
@@ -55,14 +133,15 @@ function(dune_add_formcompiler_system_test)
       endif()
     endif()
 
-    add_generated_executable(TARGET ${tname}
-                             UFLFILE ${SYSTEMTEST_UFLFILE}
-                             INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
-                             DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
-                             EXCLUDE_FROM_ALL
-                             ${SOURCE}
-                             ${ANALYZE_GRID_STR}
-                             )
+    dune_add_generated_executable(TARGET ${tname}
+                                  UFLFILE ${SYSTEMTEST_UFLFILE}
+                                  INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
+                                  DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
+                                  EXCLUDE_FROM_ALL
+                                  ${SOURCE}
+                                  ${ANALYZE_GRID_STR}
+                                  ${ANALYZE_GRID_COMMAND_STR}
+                                  )
 
     # Enrich the target with preprocessor variables from the __static section
     # just the way that dune-testtools does.
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..956de868a985aec3af90d7263417eb16b9be09df
--- /dev/null
+++ b/doc/CMakeLists.txt
@@ -0,0 +1 @@
+dune_cmake_sphinx_doc(MODULE_ONLY)
diff --git a/python/dune/codegen/__init__.py b/python/dune/codegen/__init__.py
index e892baf44c4c9fb301536c15c40059a5acc5d602..196419782487f33f13cf797d19c1fbdcbd13d946 100644
--- a/python/dune/codegen/__init__.py
+++ b/python/dune/codegen/__init__.py
@@ -10,3 +10,6 @@ import dune.codegen.loopy.symbolic  # noqa
 import dune.codegen.pdelab  # noqa
 import dune.codegen.sumfact  # noqa
 import dune.codegen.blockstructured  # noqa
+
+# Trigger imports that register finite elements in UFL
+import dune.codegen.ufl.finiteelement  # noqa
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index adda4c44eaad264f8e1f4500eeba2dbb6cf9ebf4..1e801f4550d17d89ef8972fe3ffa867d4a44ea25 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -8,7 +8,9 @@ import hashlib
 import logging
 import json
 from operator import mul
+import pkg_resources
 from six.moves import reduce
+import textwrap
 import time
 
 import loopy as lp
@@ -318,113 +320,158 @@ def generate_standalone_kernel_code(kernel, signature, filename, transformations
     sig = p.sub(r'\1', sig)
     assert 'OpCounter' not in signature
 
-    with open(filename, 'w') as f:
-        if transformations:
-            f.write('// Transformations:\n')
-            for trafo in transformations:
-                f.write('// {}\n'.format(trafo))
-            f.write('\n')
-
-        # Write headers
-        headers = ['#include "config.h"',
-                   '#include <iostream>',
-                   '#include <fstream>',
-                   '#include <random>',
-                   '#include "benchmark/benchmark.h"',
-                   '#include <dune/codegen/common/vectorclass.hh>',
-                   '#include <dune/codegen/sumfact/horizontaladd.hh>',
-                   ]
-        f.write("\n".join(headers))
-
-        # Get a list of the function argument names
-        arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
-        arguments = [a.split(' ')[-1] for a in arguments]
-
-        global_args = [a for a in kernel.args if a.name not in arguments]
-
-        # Declare global arguments
-        f.write('\n\n')
-        target = DuneTarget()
-        for g in global_args:
-            decl_info = g.decl_info(target, True, g.dtype)
-            for idi in decl_info:
-                ast_builder = target.get_device_ast_builder()
-                arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
-                arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
-                arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
-                f.write('{}\n'.format(arg_decl))
-
-        # Generate function we want to benchmark
-        f.write('\n')
-        f.write(sig[0:sig.find(')') + 1])
-        f.writelines(lp.generate_body(kernel))
-        f.write('\n\n')
-
-        # Generate function that will do the benchmarking
-        f.write('static void BM_sumfact_kernel(benchmark::State& state){\n')
-
-        # Declare random generators
+    # Which transformations were applied
+    codegen_transformations = ''
+    if transformations:
+        codegen_transformations = ''
+        for trafo in transformations:
+            codegen_transformations += '// {}\n'.format(trafo)
+
+    template = 'kernel_benchmark_template1.cc.in'
+    use_datasets = True
+
+    # Old benchmark template
+    # template = 'kernel_benchmark_template0.cc.in'
+    # use_datasets = False
+
+    template_filename = pkg_resources.resource_filename(__name__, template)
+    with open(template_filename, 'r') as f:
+        benchmark = f.read()
+
+    # Find function arguments and global arguments
+    arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
+    arguments = [a.split(' ')[-1] for a in arguments]
+    global_args = [a for a in kernel.args if a.name not in arguments]
+    buffer_arguments = [a for a in arguments if a.startswith('buff')]
+    input_arguments = [a for a in arguments if a not in buffer_arguments]
+
+    # Declare global arguments
+    codegen_declare_global_arguments = ''
+    target = DuneTarget()
+    for g in global_args:
+        decl_info = g.decl_info(target, True, g.dtype)
+        for idi in decl_info:
+            ast_builder = target.get_device_ast_builder()
+            arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
+            arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
+            arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
+            codegen_declare_global_arguments += '{}\n'.format(arg_decl)
+    codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, '  ')
+
+    # Helper function for argument initialization
+    def _initialize_arg(arg):
+        if isinstance(arg, lp.ValueArg):
+            return []
         real = type_floatingpoint()
-        lines = ['  std::uniform_real_distribution<{}> unif(0,1);'.format(real),
-                 '  std::uniform_int_distribution<int> unif_int(0,1);',
-                 '  std::default_random_engine re;']
-        f.write('\n'.join(lines) + '\n')
-
-        # Declare function arguments
-        function_arguments = [a for a in kernel.args if a.name in arguments]
-        for arg in function_arguments:
-            if 'buffer' in arg.name:
-                byte_size = reduce(mul, arg.shape) * 8
-                f.write('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
-                                                                                 byte_size,
-                                                                                 arg.alignment),)
-            elif isinstance(arg, lp.ValueArg):
-                assert 'jacobian_offset' in arg.name
-                decl = arg.get_arg_decl(ast_builder)
-                decl = Initializer(decl, 'unif_int(re)')
-                f.write('  {}\n'.format(decl))
-            else:
-                assert 'fastdg' in arg.name
-                size = reduce(mul, arg.shape)
-                min_stride = min([tag.stride for tag in arg.dim_tags])
-                size *= min_stride
-                alignment = arg.dtype.itemsize
-                f.write('  {} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
-                                                                               arg.name,
-                                                                               size,
-                                                                               alignment))
-
-        # Initialize arguments
-        def _initialize_arg(arg):
-            if isinstance(arg, lp.ValueArg):
-                return []
-            real = type_floatingpoint()
+        size = reduce(mul, arg.shape)
+        fill_name = arg.name + '_fill'
+        lines = ['  {0}* {1} = ({0} *) {2};'.format(real, fill_name, arg.name),
+                 '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
+                 '    {}[i] = unif(re);'.format(fill_name),
+                 '  }']
+        return lines
+
+    # Initialize global arguments
+    codegen_initialize_global_arguments = ''
+    for arg in global_args:
+        lines = _initialize_arg(arg)
+        codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, '  ')
+
+    codegen_initialize_input = ''
+
+    # Function we want to benchmark
+    codegen_benchmark_function = ''
+    codegen_benchmark_function += sig[0:sig.find(')') + 1]
+    codegen_benchmark_function += lp.generate_body(kernel)
+    codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, '  ')
+
+    # Declare function arguments
+    codegen_declare_arguments = []
+    codegen_declare_input = []
+    function_arguments = [a for a in kernel.args if a.name in arguments]
+    for arg in function_arguments:
+        if 'buffer' in arg.name:
+            byte_size = reduce(mul, arg.shape) * 8
+            codegen_declare_arguments.append('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
+                                                                                                      byte_size,
+                                                                                                      arg.alignment),)
+        elif isinstance(arg, lp.ValueArg):
+            assert 'jacobian_offset' in arg.name
+            decl = arg.get_arg_decl(ast_builder)
+            decl = Initializer(decl, 'unif_int(re)')
+            codegen_declare_arguments.append(('  {}\n'.format(decl)))
+        else:
+            assert 'fastdg' in arg.name
             size = reduce(mul, arg.shape)
-            fill_name = arg.name + '_fill'
-            lines = ['  {}* {} = (double *) {};'.format(real, fill_name, arg.name),
-                     '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
-                     '    {}[i] = unif(re);'.format(fill_name),
-                     '  }']
-            return lines
-
-        for arg in kernel.args:
+            min_stride = min([tag.stride for tag in arg.dim_tags])
+            size *= min_stride
+            alignment = arg.dtype.itemsize
+            real = type_floatingpoint()
+            if use_datasets:
+                codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                             arg.name,
+                                                                                                             size,
+                                                                                                             alignment)))
+            else:
+                codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                   arg.name,
+                                                                                                   size,
+                                                                                                   alignment)))
+
+    codegen_declare_arguments = ''.join(codegen_declare_arguments)
+    codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, '  ')
+    codegen_declare_input = ''.join(codegen_declare_input)
+    codegen_declare_input = textwrap.indent(codegen_declare_input, '  ')
+
+    # Initialize function arguments
+    codegen_initialize_arguments = ''
+    codegen_initialize_input = ''
+    for arg in function_arguments:
+        if 'fastdg' in arg.name:
+            if use_datasets:
+                lines = _initialize_arg(arg)
+                lines = ['  ' + a for a in lines]
+                lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
+                lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
+                lines.append('}')
+                codegen_initialize_input += '\n'.join(lines) + '\n'
+            else:
+                lines = _initialize_arg(arg)
+                codegen_initialize_arguments += '\n'.join(lines) + '\n'
+        else:
             lines = _initialize_arg(arg)
-            f.write('\n'.join(lines) + '\n')
-
-        # Benchmark loop
-        function_call = kernel.name + '({})'.format(','.join(arguments))
-        f.writelines(['  for (auto _ : state){\n',
-                      '    {};\n'.format(function_call),
-                      '  }\n',
-                      ])
-        f.write('}\n')
-
-        # Benchmark main
-        main = ['',
-                'BENCHMARK(BM_sumfact_kernel);',
-                '',
-                'BENCHMARK_MAIN();']
-        f.write('\n'.join(main))
+            codegen_initialize_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, '  ')
+    codegen_initialize_input = textwrap.indent(codegen_initialize_input, '  ')
+
+    # Call the benchmark function
+    if use_datasets:
+        arguments_with_datasets = arguments.copy()
+        arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
+        codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
+        codegen_call_benchmark_function += '  ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
+        for arg in input_arguments:
+            codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
+        codegen_call_benchmark_function += '}'
+    else:
+        codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
+    codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, '    ')
+
+    # Replace placeholders in benchmark template
+    benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
+    benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
+    benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
+
+    # Write benchmark source file
+    with open(filename, 'w') as f:
+        f.writelines(benchmark)
 
     # Maybe turn opcounting on again
     set_option("opcounter", opcounting)
@@ -491,6 +538,7 @@ def autotune_realization(sf=None, kernel=None, signature=None, transformations=N
 
                 call.extend(compiler_invocation(executable, filename))
                 devnull = open(os.devnull, 'w')
+                os.environ['DUNE_CODEGEN_THREADS'] = '1'
                 ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
                     raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
@@ -509,7 +557,7 @@ def autotune_realization(sf=None, kernel=None, signature=None, transformations=N
                 call.append(executable)
                 if get_option("autotune_google_benchmark"):
                     call.append("--benchmark_out={}".format(logname))
-                    call.append("--benchmark_repetitions=10")
+                    call.append("--benchmark_repetitions=5")
                     # call.append("--benchmark_out_format=csv")
                 else:
                     call.append(logname)
@@ -529,7 +577,9 @@ def autotune_realization(sf=None, kernel=None, signature=None, transformations=N
                         data = json.load(json_file)
                         minimal_time = 1e80
                         for b in data['benchmarks']:
-                            if b['name'] == 'BM_sumfact_kernel':
+                            if b['name'].endswith('_mean') or b['name'].endswith('_median') or b['name'].endswith('_stddev'):
+                                pass
+                            else:
                                 if b['cpu_time'] < minimal_time:
                                     minimal_time = b['cpu_time']
                         assert minimal_time < 1e80
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c4b49da5aec59006a9f007534dc7711b98ef0387
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
@@ -0,0 +1,37 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+#include <iostream>
+#include <fstream>
+#include <random>
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+${CODEGEN_BENCHMARK_FUNCTION}
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  std::uniform_real_distribution<double> unif(0,1);
+  std::uniform_int_distribution<int> unif_int(0,1);
+  std::default_random_engine re;
+
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+  ${CODEGEN_INITIALIZE_INPUT}
+
+  ${CODEGEN_DECLARE_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+  for (auto _ : state){
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+}
+
+BENCHMARK(BM_sumfact_kernel);
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c03577d672123b3063020841b731fc1fd9780fce
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
@@ -0,0 +1,58 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+
+#include <iostream>
+#include <fstream>
+#include <random>
+#include <cstdlib>
+
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+const int datasets = 512;
+
+class Benchmark{
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+
+public:
+  Benchmark (){
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+    ${CODEGEN_INITIALIZE_INPUT}
+  }
+
+  ${CODEGEN_BENCHMARK_FUNCTION}
+
+  void run_benchmark(){
+    ${CODEGEN_DECLARE_ARGUMENTS}
+
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+
+};
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  Benchmark benchmark;
+  for (auto _ : state){
+    benchmark.run_benchmark();
+  }
+}
+
+BENCHMARK(BM_sumfact_kernel) -> Threads(std::getenv("DUNE_CODEGEN_THREADS") ? atoi(std::getenv("DUNE_CODEGEN_THREADS")) : 1) -> UseRealTime();
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/ufl/finiteelement.py b/python/dune/codegen/ufl/finiteelement.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae0ef35f2a3b30cd2837525447233dfe017cd76d
--- /dev/null
+++ b/python/dune/codegen/ufl/finiteelement.py
@@ -0,0 +1,43 @@
+""" Registering finite elements that we have, but FEniCS does not """
+
+from ufl import register_element, L2
+
+
+register_element("Discontinuous Gauss-Lobatto-Legendre",
+                 "DGLL",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("Monomials",
+                 "Monom",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("L2-Orthonormal Polynomials",
+                 "OPB",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("Rannacher-Turek",
+                 "RaTu",
+                 0,
+                 L2,
+                 "identity",
+                 (1, 1),
+                 ('quadrilateral', 'hexahedron'),
+                 )