diff --git a/CMakeLists.txt b/CMakeLists.txt
index e333cab7b37c5466067f1cd5f05c9385e5fa120c..bf740e74d1a29d6e822e44698388d2984e7c7fb9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.1.3)
 project(dune-codegen CXX)
 
 if(NOT (dune-common_DIR OR dune-common_ROOT OR
@@ -29,6 +29,7 @@ dune_register_package_flags(LIBRARIES dunecodegen)
 
 dune_enable_all_packages()
 
+add_subdirectory(doc)
 add_subdirectory(dune/codegen)
 add_subdirectory(cmake/modules)
 
diff --git a/README.md b/README.md
index 8896b12177ec1532e69d64f1748b81f140123643..5856f3514a91a64db9734af6cf7825952b3935c0 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,25 @@ ctest
 
 Note that this takes quite a while.
 
+## Building and Running dune-codegen in an offline environment
+
+dune-codegen relies on installing Python packages into self-contained environments
+during its configuration and build process. In order to do this in an offline
+environment, we recommend using the tool `devpi`. One of its use cases is to provide
+a local mirror for the Python package index. A quickstart tutorial for this use case
+is available at [5]. It boils down to the following steps:
+
+* Installing the `devpi-server` package through your favorite method
+* Setting up a local server with `devpi-server --init`
+* Making sure it is running in the background (explicitly with `devpi-server --start/stop` or by configuring a systemd service)
+* Pointing the environment variable `PIP_INDEX_URL` at its index, e.g. by adding this line to your `~/.bashrc` (where `http://localhost:3141` may differ depending on your devpi configuration):
+```
+export PIP_INDEX_URL=http://localhost:3141/root/pypi/+simple/
+```
+
+On the first installation, the local package mirror will fetch the required packages from PyPI.
+Afterwards, it serves packages from its local cache.
+
 ## Links
 
 [0]: https://git-lfs.github.com/
@@ -112,3 +131,4 @@ Note that this takes quite a while.
 [2]: https://gitlab.dune-project.org/quality/dune-testtools
 [3]: http://isl.gforge.inria.fr/
 [4]: https://www.dune-project.org/doc/installation/
+[5]: https://github.com/devpi/devpi/blob/master/doc/quickstart-pypimirror.rst
diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py
index d383963318291ae947c70b1f083c320a60250303..22835c7cbce61336168396007e73703394eb7433 100755
--- a/bin/donkey_benchmark_execution_wrapper.py
+++ b/bin/donkey_benchmark_execution_wrapper.py
@@ -5,8 +5,16 @@ import sys
 import subprocess
 import os
 
+os.environ['DUNE_CODEGEN_THREADS'] = '20'
+
 # Run the actual command
-command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+
+# The actual measurements will be called like this (mpi parallelism)
+# command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+
+# Command for autotune benchmarks with threading
+command = "srun -p haswell10c -n 1 -c 20 --hint=nomultithread --cpu_bin=verbose,core".split()
+
 command.extend(sys.argv[1:])
 ret = subprocess.call(command)
 
diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake
index 61713109b32cc1af4716b0f9f133879592fec49c..cff09c5ec49c0675680b5f49c1f24f9008c93cc5 100644
--- a/cmake/modules/DuneCodegenMacros.cmake
+++ b/cmake/modules/DuneCodegenMacros.cmake
@@ -1,6 +1,6 @@
 # File for module specific CMake tests.
 #
-# .. cmake_function:: add_generated_executable
+# .. cmake_function:: dune_add_generated_executable
 #
 #    .. cmake_param:: UFLFILE
 #       :single:
@@ -22,19 +22,20 @@
 #       The name given to the added executable target.
 #
 #    .. cmake_param:: SOURCE
+#       :single:
 #
 #       The cc source file to build from. If omitted, a minimal
 #       source file and a driver file will be generated.
 #
 #    .. cmake_param:: FORM_COMPILER_ARGS
 #       :multi:
-#       :argname arg:
+#       :argname: arg
 #
 #       Additional arguments as recognized by the form compiler.
 #
 #    .. cmake_param:: DEPENDS
 #       :multi:
-#       :argname dep:
+#       :argname: dep
 #
 #       Additional dependencies of the generated executable (changes in those
 #       will retrigger generation)
@@ -57,7 +58,7 @@
 #
 #    .. cmake_param:: ANALYZE_GRID_COMMAND
 #       :multi:
-#       :argname command:
+#       :argname: command
 #
 #       Use this to pass a custom grid analysis command. This is necessary
 #       if you use a custom grid generation methdod. The inifile and the
@@ -104,7 +105,7 @@ else()
 endif()
 file(GLOB_RECURSE UFL2PDELAB_SOURCES ${UFL2PDELAB_GLOB_PATTERN})
 
-function(add_generated_executable)
+function(dune_add_generated_executable)
   set(OPTIONS EXCLUDE_FROM_ALL ANALYZE_GRID)
   set(SINGLE TARGET SOURCE UFLFILE INIFILE)
   set(MULTI FORM_COMPILER_ARGS DEPENDS ANALYZE_GRID_COMMAND)
@@ -112,15 +113,20 @@ function(add_generated_executable)
   cmake_parse_arguments(GEN "${OPTIONS}" "${SINGLE}" "${MULTI}" ${ARGN})
 
   if(GEN_UNPARSED_ARGUMENTS)
-    message(FATAL_ERROR "Unrecognized arguments in add_generated_executable. This usually indicates a typo.")
+    message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. This usually indicates a typo.")
+  endif()
+
+  set(MPI_OPTION "0")
+  if(MPI_FOUND)
+    set(MPI_OPTION "1")
   endif()
 
   # Apply defaults and enforce requirements
   if(NOT GEN_TARGET)
-    message(FATAL_ERROR "Need to specify the TARGET parameter for add_generated_executable")
+    message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable")
   endif()
   if(NOT GEN_UFLFILE)
-    message(FATAL_ERROR "Need to specify the UFLFILE parameter for add_generated_executable")
+    message(FATAL_ERROR "Need to specify the UFLFILE parameter for dune_add_generated_executable")
   endif()
   if(NOT IS_ABSOLUTE ${GEN_UFLFILE})
     set(GEN_UFLFILE ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_UFLFILE})
@@ -138,6 +144,7 @@ function(add_generated_executable)
                                --target-name ${GEN_TARGET}
                                --driver-file ${GEN_SOURCE}
                                --project-basedir ${CMAKE_BINARY_DIR}
+                               --with-mpi ${MPI_OPTION}
                                ${GEN_FORM_COMPILER_ARGS}
                        DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES}
                        COMMENT "Generating driver for the target ${GEN_TARGET}"
@@ -171,10 +178,8 @@ function(add_generated_executable)
   endif()
 
   # Parse a mapping of operators to build and their respective filenames
-  dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET}
-                       OUTPUT_VARIABLE depdata
-                       )
-  parse_python_data(PREFIX depdata INPUT ${depdata})
+  dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
+  parse_python_data(PREFIX depdata FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
 
   if(DUNE_CODEGEN_PROFILING)
     # This is a bit silly, but cProfile only finds entry point scripts
@@ -198,6 +203,7 @@ function(add_generated_executable)
                                --ini-file ${GEN_INIFILE}
                                --target-name ${GEN_TARGET}
                                --operator-to-build ${op}
+                               --with-mpi ${MPI_OPTION}
                                ${ANALYZE_GRID_OPTION}
                        DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} ${ANALYZE_GRID_FILE}
                        COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}"
diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake
index 206ce3bcbf10a457182ffbb1a5f8e415ea3fcb24..2a1907950a7f5bd68e85a5c536465a73634828cf 100644
--- a/cmake/modules/GeneratedSystemtests.cmake
+++ b/cmake/modules/GeneratedSystemtests.cmake
@@ -1,11 +1,85 @@
-# System testing for generated executables. All ideas taken from dune-testtools.
+# System testing for generated executables. All ideas taken from dune-testtools.
+#
+# .. cmake_function:: dune_add_formcompiler_system_test
+#
+#    .. cmake_param:: UFLFILE
+#       :single:
+#       :required:
+#
+#       The UFL file to create the generate code from.
+#
+#    .. cmake_param:: INIFILE
+#       :single:
+#       :required:
+#
+#       The ini file that controls the form compilation process.
+#       It is expected to contain a [formcompiler] section. This
+#       file may contain meta ini annotations.
+#
+#    .. cmake_param:: BASENAME
+#       :single:
+#       :required:
+#
+#       The basename for the generated executables.
+#
+#    .. cmake_param:: SCRIPT
+#       :single:
+#
+#       The python script that decides about test failure/success.
+#       Defaults to a script that simply runs the program and checks
+#       the exit code. More scripts to be found in dune-testtools.
+#
+#    .. cmake_param:: SOURCE
+#       :single:
+#
+#       The cc source file to build from. If omitted, a minimal
+#       source file and a driver file will be generated.
+#
+#    .. cmake_param:: CREATED_TARGETS
+#       :single:
+#
+#       A variable name that should be filled with the list of generated
+#       targets. This can be used to modify these targets later on.
+#
+#    .. cmake_param:: DEPENDS
+#       :multi:
+#       :argname: dep
+#
+#       Additional dependencies of the generated executable (changes in those
+#       will retrigger generation)
+#
+#    .. cmake_param:: NO_TESTS
+#       :option:
+#
+#       If given, code will be generated and built normally, but no tests will
+#       be added to the test suite.
+#
+#    .. cmake_param:: ANALYZE_GRID
+#       :option:
+#
+#       Set this option to enable code generation time grid analysis.
+#       This is useful to reduce the variety of sum factorization kernels
+#       in unstructured grids. Note that the grid analysis tool needs to
+#       be able to construct your grid from the given inifile. If you have
+#       a custom grid construction method, you can use ANALYZE_GRID_COMMAND
+#       instead.
+#
+#    .. cmake_param:: ANALYZE_GRID_COMMAND
+#       :multi:
+#       :argname: command
+#
+#       Use this to pass a custom grid analysis command. This is necessary
+#       if you use a custom grid generation method. The inifile and the
+#       outputfile will be appended to this command. You can use the analysis code in
+#       dune/codegen/sumfact/analyzegrid.hh to write your own tool.
+#       Specifying this option will automatically set ANALYZE_GRID.
 #
 
 function(dune_add_formcompiler_system_test)
   # parse arguments
   set(OPTION DEBUG NO_TESTS ANALYZE_GRID)
-  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE)
-  set(MULTI CREATED_TARGETS DEPENDS)
+  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE CREATED_TARGETS)
+  set(MULTI DEPENDS ANALYZE_GRID_COMMAND)
   cmake_parse_arguments(SYSTEMTEST "${OPTION}" "${SINGLE}" "${MULTI}" ${ARGN})
 
   if(SYSTEMTEST_UNPARSED_ARGUMENTS)
@@ -25,6 +99,10 @@ function(dune_add_formcompiler_system_test)
   if(SYSTEMTEST_ANALYZE_GRID)
     set(ANALYZE_GRID_STR "ANALYZE_GRID")
   endif()
+  set(ANALYZE_GRID_COMMAND_STR "")
+  if(SYSTEMTEST_ANALYZE_GRID_COMMAND)
+    set(ANALYZE_GRID_COMMAND_STR "ANALYZE_GRID_COMMAND ${SYSTEMTEST_ANALYZE_GRID_COMMAND}")
+  endif()
 
   # set a default for the script. call_executable.py just calls the executable.
   # There, it is also possible to hook in things depending on the inifile
@@ -41,9 +119,14 @@ function(dune_add_formcompiler_system_test)
   configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} ${CMAKE_CURRENT_BINARY_DIR}/tmp_${SYSTEMTEST_INIFILE})
 
   # expand the given meta ini file into the build tree
-  execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py --cmake --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} --dir ${CMAKE_CURRENT_BINARY_DIR} --section formcompiler
-                  OUTPUT_VARIABLE output)
-  parse_python_data(PREFIX INIINFO INPUT "${output}")
+  execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py
+                          --cmake
+                          --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE}
+                          --dir ${CMAKE_CURRENT_BINARY_DIR}
+                          --section formcompiler
+                          --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
+                          )
+  parse_python_data(PREFIX INIINFO FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
 
   foreach(inifile ${INIINFO_names})
     if(${INIINFO_${inifile}_suffix} STREQUAL "__empty")
@@ -55,23 +138,25 @@ function(dune_add_formcompiler_system_test)
       endif()
     endif()
 
-    add_generated_executable(TARGET ${tname}
-                             UFLFILE ${SYSTEMTEST_UFLFILE}
-                             INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
-                             DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
-                             EXCLUDE_FROM_ALL
-                             ${SOURCE}
-                             ${ANALYZE_GRID_STR}
-                             )
+    dune_add_generated_executable(TARGET ${tname}
+                                  UFLFILE ${SYSTEMTEST_UFLFILE}
+                                  INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
+                                  DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
+                                  EXCLUDE_FROM_ALL
+                                  ${SOURCE}
+                                  ${ANALYZE_GRID_STR}
+                                  ${ANALYZE_GRID_COMMAND_STR}
+                                  )
 
     # Enrich the target with preprocessor variables from the __static section
     # just the way that dune-testtools does.
     dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_extract_static.py
                                --ini ${inifile}
+                               --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
                          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
                          OUTPUT_VARIABLE output
                          ERROR_MESSAGE "Error extracting static info from ${inifile}")
-    parse_python_data(PREFIX STAT INPUT "${output}")
+    parse_python_data(PREFIX STAT FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
 
     foreach(config ${STAT___CONFIGS})
       foreach(cd ${STAT___STATIC_DATA})
diff --git a/cmake/modules/deplist.py b/cmake/modules/deplist.py
index 9cb5d7d42cbfc712e18e37798dffc2d553416f8c..7b0afac3c7c5f216e9710cef394d175c0c6c9ee9 100755
--- a/cmake/modules/deplist.py
+++ b/cmake/modules/deplist.py
@@ -22,5 +22,5 @@ def get_filename(operator):
 result = {"__{}".format(o): get_filename(o) for o in operators}
 result["__operators"] = ";".join(operators)
 
-printForCMake(result)
+printForCMake(result, sys.argv[3])
 sys.exit(0)
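Note on this change: the CMake/Python interface switches from capturing stdout (`OUTPUT_VARIABLE` plus `parse_python_data(... INPUT ...)`) to exchanging a file (`parse_python_data(... FILE ...)`), with the target path passed as a third command line argument. The sketch below only illustrates that direction of data flow; the real serialization is implemented by `printForCMake` from dune-testtools, and the format shown here is purely illustrative.

```python
# Conceptual sketch only: the actual printForCMake lives in dune-testtools and
# defines the real on-disk format. The relevant point is that the data is now
# written to the path passed as sys.argv[3] instead of being printed to stdout.
def print_for_cmake_sketch(data, path):
    with open(path, "w") as f:
        for key, value in sorted(data.items()):
            f.write("{};{}\n".format(key, value))  # illustrative format only
```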
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..956de868a985aec3af90d7263417eb16b9be09df
--- /dev/null
+++ b/doc/CMakeLists.txt
@@ -0,0 +1 @@
+dune_cmake_sphinx_doc(MODULE_ONLY)
diff --git a/dune/codegen/common/simdtraits.hh b/dune/codegen/common/simdtraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..73ee4caff1bb743d275e20173556416cc0bd2d30
--- /dev/null
+++ b/dune/codegen/common/simdtraits.hh
@@ -0,0 +1,16 @@
+#ifndef DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+#define DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+
+/** This is just the declaration of the traits classes, specialization for VCL and
+ *  OpCounter VCL are elsewhere.
+ */
+
+template<typename T>
+struct base_floatingpoint
+{};
+
+template<typename T>
+struct simd_size
+{};
+
+#endif
diff --git a/dune/codegen/common/vcltraits.hh b/dune/codegen/common/vcltraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..de764dbe6f78a51d1bf5934856ca36d4b90a77d0
--- /dev/null
+++ b/dune/codegen/common/vcltraits.hh
@@ -0,0 +1,86 @@
+#ifndef DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+#define DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+
+/** A collection of traits tools for the Vector Class Library */
+
+#include<dune/codegen/common/vectorclass.hh>
+
+
+template<>
+struct base_floatingpoint<Vec2d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec4f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec2d>
+{
+  static constexpr std::size_t value = 2;
+};
+
+template<>
+struct simd_size<Vec4f>
+{
+  static constexpr std::size_t value = 4;
+};
+
+#if MAX_VECTOR_SIZE >= 256
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec4d>
+{
+  static constexpr std::size_t value = 4;
+};
+
+template<>
+struct simd_size<Vec8f>
+{
+  static constexpr std::size_t value = 8;
+};
+#endif
+
+#if MAX_VECTOR_SIZE >= 512
+template<>
+struct base_floatingpoint<Vec8d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec16f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec8d>
+{
+  static constexpr std::size_t value = 8;
+};
+
+template<>
+struct simd_size<Vec16f>
+{
+  static constexpr std::size_t value = 16;
+};
+
+#endif
+
+#endif
diff --git a/dune/codegen/common/vectorclass.hh b/dune/codegen/common/vectorclass.hh
index aa3bba7b98f9cf57dcc223f61755e3aa26c5bdca..648ea52fe24490efc8af431fe54de4089515af33 100644
--- a/dune/codegen/common/vectorclass.hh
+++ b/dune/codegen/common/vectorclass.hh
@@ -1,71 +1,23 @@
 #ifndef DUNE_CODEGEN_COMMON_VECTORCLASS_HH
 #define DUNE_CODEGEN_COMMON_VECTORCLASS_HH
 
-
-template<typename T>
-struct base_floatingpoint
-{};
+#include<dune/codegen/common/simdtraits.hh>
 
 #ifdef ENABLE_COUNTER
+
 #if HAVE_DUNE_OPCOUNTER
 #include<dune/opcounter/vectorclass.hh>
-
-template<typename F, int size>
-struct base_floatingpoint<OpCounter::impl::OpCounterVector<F, size>>
-{
-  using value = OpCounter::OpCounter<F>;
-};
-
-
 #else
 #error "dune-opcounter is needed for opcounted vector types"
 #endif
+
 #else
+
 #include<dune/codegen/vectorclass/vectorclass.h>
 #include<dune/codegen/vectorclass/vectormath_exp.h>
 #include<dune/codegen/vectorclass/vectormath_hyp.h>
 #include<dune/codegen/vectorclass/vectormath_trig.h>
-
-template<>
-struct base_floatingpoint<Vec2d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec4f>
-{
-  using value = float;
-};
-
-
-#if MAX_VECTOR_SIZE >= 256
-template<>
-struct base_floatingpoint<Vec4d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec8f>
-{
-  using value = float;
-};
-#endif
-
-#if MAX_VECTOR_SIZE >= 512
-template<>
-struct base_floatingpoint<Vec8d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec16f>
-{
-  using value = float;
-};
-#endif
+#include<dune/codegen/common/vcltraits.hh>
 
 #endif
 
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 8881504f934e7b1d755580ffc46d47f2a1467346..67694937376b1b6cdaa66d44db07289652f62bcc 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory(test)
 add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
 target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
 
-if(benchmark_FOUND)
-  target_link_libraries(_autotune_target benchmark)
+find_package(Threads)
+if(benchmark_FOUND AND Threads_FOUND)
+  target_link_libraries(_autotune_target benchmark Threads::Threads)
 endif()
diff --git a/python/dune/codegen/__init__.py b/python/dune/codegen/__init__.py
index e892baf44c4c9fb301536c15c40059a5acc5d602..196419782487f33f13cf797d19c1fbdcbd13d946 100644
--- a/python/dune/codegen/__init__.py
+++ b/python/dune/codegen/__init__.py
@@ -10,3 +10,6 @@ import dune.codegen.loopy.symbolic  # noqa
 import dune.codegen.pdelab  # noqa
 import dune.codegen.sumfact  # noqa
 import dune.codegen.blockstructured  # noqa
+
+# Trigger imports that register finite elements in UFL
+import dune.codegen.ufl.finiteelement  # noqa
diff --git a/python/dune/codegen/blockstructured/accumulation.py b/python/dune/codegen/blockstructured/accumulation.py
index 9af9759d5e3426df12a10db39f94048e48006867..ec2e3d9917998a7a024796d48aaf6be6488a271c 100644
--- a/python/dune/codegen/blockstructured/accumulation.py
+++ b/python/dune/codegen/blockstructured/accumulation.py
@@ -1,12 +1,12 @@
-from dune.codegen.blockstructured.tools import sub_element_inames
-from dune.codegen.generation import accumulation_mixin, instruction
+from dune.codegen.blockstructured.tools import sub_element_inames, name_accumulation_alias
+from dune.codegen.generation import accumulation_mixin, instruction, get_global_context_value
 from dune.codegen.loopy.target import dtype_floatingpoint
 from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.geometry import world_dimension, name_intersection_geometry_wrapper
 from dune.codegen.pdelab.localoperator import determine_accumulation_space, GenericAccumulationMixin
 from dune.codegen.pdelab.argument import name_accumulation_variable
 from dune.codegen.pdelab.localoperator import boundary_predicates
-from dune.codegen.generation.loopy import function_mangler, globalarg, temporary_variable
+from dune.codegen.generation.loopy import function_mangler, temporary_variable
 import loopy as lp
 import pymbolic.primitives as prim
 
@@ -16,32 +16,10 @@ from loopy.match import Writes
 @accumulation_mixin("blockstructured")
 class BlockStructuredAccumulationMixin(GenericAccumulationMixin):
     def generate_accumulation_instruction(self, expr):
-        if get_form_option('vectorization_blockstructured'):
-            return generate_accumulation_instruction_vectorized(expr, self)
-        else:
+        if get_global_context_value("form_type") == "jacobian":
             return generate_accumulation_instruction(expr, self)
-
-
-def name_accumulation_alias(container, accumspace):
-    name = container + "_" + accumspace.lfs.name + "_alias"
-    name_tail = container + "_" + accumspace.lfs.name + "_alias_tail"
-    k = get_form_option("number_of_blocks")
-    p = accumspace.element.degree()
-
-    def _add_alias_insn(name):
-        dim = world_dimension()
-        element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
-        index_stride = tuple((p * k + 1)**i for i in range(0, dim))
-        globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
-        code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name)
-        instruction(within_inames=frozenset(),
-                    code=code,
-                    read_variables=frozenset({container}),
-                    assignees=frozenset({name}))
-
-    _add_alias_insn(name)
-    _add_alias_insn(name_tail)
-    return name
+        else:
+            return generate_accumulation_instruction_vectorized(expr, self)
 
 
 @function_mangler
diff --git a/python/dune/codegen/blockstructured/argument.py b/python/dune/codegen/blockstructured/argument.py
index 420773e85bea93ee55cb310255da7fa60d55d9de..deff1b8415d246d0ed197692df46b61db2dbd5cc 100644
--- a/python/dune/codegen/blockstructured/argument.py
+++ b/python/dune/codegen/blockstructured/argument.py
@@ -1,29 +1,11 @@
-from dune.codegen.generation import (kernel_cached,
-                                     valuearg, instruction, globalarg)
+from dune.codegen.generation import kernel_cached, valuearg
 from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.argument import CoefficientAccess
-from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames
-from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames, name_container_alias
 from loopy.types import NumpyType
 import pymbolic.primitives as prim
 
 
-def name_alias(container, lfs, element):
-    name = container + "_" + lfs.name + "_alias"
-    k = get_form_option("number_of_blocks")
-    p = element.degree()
-    dim = world_dimension()
-    element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
-    index_stride = tuple((p * k + 1)**i for i in range(0, dim))
-    globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
-    code = "const auto {} = &{}({},0);".format(name, container, lfs.name)
-    instruction(within_inames=frozenset(),
-                code=code,
-                read_variables=frozenset({container}),
-                assignees=frozenset({name}))
-    return name
-
-
 # TODO remove the need for element
 @kernel_cached
 def pymbolic_coefficient(container, lfs, element, index):
@@ -36,9 +18,6 @@ def pymbolic_coefficient(container, lfs, element, index):
         lfs = prim.Variable(lfs)
 
     # use higher order FEM index instead of Q1 index
-    if get_form_option("vectorization_blockstructured"):
-        subelem_inames = sub_element_inames()
-        coeff_alias = name_alias(container, lfs, element)
-        return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index))
-    else:
-        return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
+    subelem_inames = sub_element_inames()
+    coeff_alias = name_container_alias(container, lfs, element)
+    return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index))
diff --git a/python/dune/codegen/blockstructured/basis.py b/python/dune/codegen/blockstructured/basis.py
index 5af8bb48f2bb625a0b336c5cae6d42863f0063c5..efbc1bc9b3b206d0503686679d6a15547d5004a9 100644
--- a/python/dune/codegen/blockstructured/basis.py
+++ b/python/dune/codegen/blockstructured/basis.py
@@ -7,7 +7,7 @@ from dune.codegen.generation import (basis_mixin,
                                      globalarg,
                                      class_member,
                                      initializer_list,
-                                     include_file,)
+                                     include_file, preamble)
 from dune.codegen.tools import get_pymbolic_basename, get_pymbolic_indices
 from dune.codegen.loopy.target import type_floatingpoint
 from dune.codegen.pdelab.basis import (GenericBasisMixin,
@@ -38,11 +38,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         assert not isinstance(element, MixedElement)
         name = "phi_{}".format(FEM_name_mangling(element))
         name = restricted_name(name, restriction)
+        self.init_basis_cache(element, restriction)
         self.evaluate_basis(element, name, restriction)
         inames = self.lfs_inames(element, restriction, number, context=context)
 
         return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
 
+    @preamble(kernel='operator')
+    def init_basis_cache(self, element, restriction):
+        if not restriction:
+            cache = name_localbasis_cache(element)
+            localbasis = name_localbasis(element)
+            qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
+            return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
+                    "{",
+                    "  {}.evaluateFunction({}[i], {});".format(cache, qp_name, localbasis),
+                    "}"]
+        else:
+            return []
+
     @kernel_cached
     def evaluate_basis(self, element, name, restriction):
         temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1),
@@ -60,11 +74,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         assert not isinstance(element, MixedElement)
         name = "js_{}".format(FEM_name_mangling(element))
         name = restricted_name(name, restriction)
+        self.init_gradient_cache(element, restriction)
         self.evaluate_reference_gradient(element, name, restriction)
         inames = self.lfs_inames(element, restriction, number, context=context)
 
         return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
 
+    @preamble(kernel='operator')
+    def init_gradient_cache(self, element, restriction):
+        if not restriction:
+            cache = name_localbasis_cache(element)
+            localbasis = name_localbasis(element)
+            qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
+            return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
+                    "{",
+                    "  {}.evaluateJacobian({}[i], {});".format(cache, qp_name, localbasis),
+                    "}"]
+        else:
+            return []
+
     @kernel_cached
     def evaluate_reference_gradient(self, element, name, restriction):
         temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1, world_dimension()),
diff --git a/python/dune/codegen/blockstructured/tools.py b/python/dune/codegen/blockstructured/tools.py
index 802b819f23754aa0862e122f4127a3db18593117..e9acf640cc3f6febad32dd9bec40cda7d75b881f 100644
--- a/python/dune/codegen/blockstructured/tools.py
+++ b/python/dune/codegen/blockstructured/tools.py
@@ -2,7 +2,7 @@ from dune.codegen.tools import get_pymbolic_basename
 from dune.codegen.generation import (iname,
                                      domain,
                                      temporary_variable,
-                                     instruction)
+                                     instruction, globalarg, preamble)
 from dune.codegen.pdelab.geometry import world_dimension
 from dune.codegen.options import get_form_option
 import pymbolic.primitives as prim
@@ -74,3 +74,33 @@ def name_point_in_macro(point_in_micro, visitor):
     name = get_pymbolic_basename(point_in_micro) + "_macro"
     define_point_in_macro(name, point_in_micro, visitor)
     return name
+
+
+@preamble
+def define_container_alias(name, container, lfs, element, is_const):
+    k = get_form_option("number_of_blocks")
+    p = element.degree()
+    dim = world_dimension()
+    element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
+    index_stride = tuple((p * k + 1)**i for i in range(0, dim))
+    globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
+    if is_const:
+        return "const auto {} = &{}({},0);".format(name, container, lfs.name)
+    else:
+        return "auto {} = &{}.container()({},0);".format(name, container, lfs.name)
+
+
+def name_accumulation_alias(container, accumspace):
+    name = container + "_" + accumspace.lfs.name + "_alias"
+    name_tail = container + "_" + accumspace.lfs.name + "_alias_tail"
+
+    define_container_alias(name, container, accumspace.lfs, accumspace.element, is_const=False)
+    define_container_alias(name_tail, container, accumspace.lfs, accumspace.element, is_const=False)
+    return name
+
+
+def name_container_alias(container, lfs, element):
+    name = container + "_" + lfs.name + "_alias"
+
+    define_container_alias(name, container, lfs, element, is_const=True)
+    return name
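To make the stride computation in `define_container_alias` concrete, the following standalone sketch evaluates it for arbitrary example values of `number_of_blocks` (k), polynomial degree (p) and world dimension; adjacent sub-elements share their boundary degrees of freedom, which is why consecutive blocks are only p (and not p+1) container entries apart.

```python
# Standalone illustration of the alias layout used above (example values only).
k, p, dim = 4, 2, 2   # number_of_blocks, element degree, world dimension

element_stride = tuple(p * (p * k + 1) ** i for i in range(dim))  # (2, 18)
index_stride = tuple((p * k + 1) ** i for i in range(dim))        # (1, 9)
shape = (k,) * dim + (p + 1,) * dim                               # (4, 4, 3, 3)

# The alias views the flat container of (p*k+1)**dim = 81 entries as
# k x k sub-elements with (p+1) x (p+1) local DOFs each.
print(shape, element_stride + index_stride)
```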
diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index d5224e6ffdd19ae88ccbdc6a798a8b416a6fd36d..94bc4875f262c72d695007be04c0b4107208f2f5 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -26,20 +26,22 @@ def add_vcl_temporaries(knl, vcl_size):
     init_iname = 'init_vec{}'.format(vcl_size)
     from islpy import BasicSet
     init_domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(init_iname, get_vcl_type_size(dtype_floatingpoint())))
+
+    silenced_warnings = []
+
     for alias in vector_alias:
         vector_name = alias.replace('alias', 'vec{}'.format(vcl_size))
         new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64,
                                                                  shape=(vcl_size,), managed=True,
                                                                  scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
-        # write once to the vector such that loopy won't complain
-        new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(init_iname)),
-                                       expression=0, within_inames=frozenset({init_iname}),
-                                       id='init_{}'.format(vector_name)))
+        # silence warning such that loopy won't complain
+        silenced_warnings.append("read_no_write({})".format(vector_name))
 
     from loopy.kernel.data import VectorizeTag
     return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [init_domain],
                     temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries),
-                    iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}))
+                    iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}),
+                    silenced_warnings=knl.silenced_warnings + silenced_warnings)
 
 
 def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level):
@@ -248,6 +250,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
         load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
                                              id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),
                                              depends_on=insn.depends_on | write_ids,
+                                             depends_on_is_final=True,
                                              tags=frozenset({'vectorized_{}'.format(level)})))
         read_dependencies.setdefault(id, set())
         read_dependencies[id].add(load_id)
@@ -277,6 +280,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
                                               id=store_id, within_inames=insn.within_inames,
                                               depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] |
                                                           write_ids),
+                                              depends_on_is_final=True,
                                               tags=frozenset({'vectorized_{}'.format(level)})))
 
     # replace alias with vcl vector, except for accumulation assignee
@@ -341,10 +345,12 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
                 new_insns.append(insn.copy(assignee=assignee_vec,
                                            depends_on=(insn.depends_on | read_dependencies[insn.id] |
                                                        write_ids),
+                                           depends_on_is_final=True,
                                            tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
             else:
                 new_insns.append(insn.copy(depends_on=(insn.depends_on | read_dependencies[insn.id] |
                                                        write_ids),
+                                           depends_on_is_final=True,
                                            tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
 
     return knl.copy(instructions=new_insns + load_insns + store_insns)
diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py
index bed0256407b7259bab61b6e932c4a17761097e75..97090e18852359b10d1a2d3f74a268a3abac60f1 100644
--- a/python/dune/codegen/generation/__init__.py
+++ b/python/dune/codegen/generation/__init__.py
@@ -24,6 +24,7 @@ from dune.codegen.generation.cpp import (base_class,
                                          preamble,
                                          post_include,
                                          template_parameter,
+                                         dump_ssc_marks
                                          )
 
 from dune.codegen.generation.hooks import (hook,
diff --git a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py
index b918291067f45c5f988bc8fdcea55651d538a9db..2ea4c346590ee80ef329fdc9394b9fbc3c59db9c 100644
--- a/python/dune/codegen/generation/cpp.py
+++ b/python/dune/codegen/generation/cpp.py
@@ -55,3 +55,10 @@ def dump_accumulate_timer(name):
 @generator_factory(item_tags=("register_likwid_timers",))
 def register_liwkid_timer(name):
     return "LIKWID_MARKER_REGISTER(\"{}\");".format(name)
+
+
+@generator_factory(item_tags=("register_ssc_marks",))
+def dump_ssc_marks(name):
+    from dune.codegen.pdelab.driver.timings import get_region_marks
+    return 'std::cout << "{}: " << {} << " <--> " << {} << std::endl;'.format(name,
+                                                                              *get_region_marks(name, driver=False))
diff --git a/python/dune/codegen/logging.conf b/python/dune/codegen/logging.conf
index 32974b12744dd1c23c36f561148943ec936b3ccb..1fcc4ba02962460ea3f1db8c678dbba5ef3fef2e 100644
--- a/python/dune/codegen/logging.conf
+++ b/python/dune/codegen/logging.conf
@@ -1,5 +1,5 @@
 [loggers]
-keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.vectorization
+keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.transformations,dune.codegen.sumfact.vectorization
 
 [handlers]
 keys=consoleHandler
@@ -16,6 +16,12 @@ handlers=consoleHandler
 qualname=dune.codegen.pdelab.localoperator
 propagate=0
 
+[logger_dune.codegen.sumfact.transformations]
+level=INFO
+handlers=consoleHandler
+qualname=dune.codegen.sumfact.transformations
+propagate=0
+
 [logger_dune.codegen.sumfact.vectorization]
 level=INFO
 handlers=consoleHandler
diff --git a/python/dune/codegen/loopy/transformations/performance.py b/python/dune/codegen/loopy/transformations/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..02644e80c3c0f782309a165c050949e0f1d71cb2
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/performance.py
@@ -0,0 +1,8 @@
+from dune.codegen.options import get_form_option
+from dune.codegen.sumfact.transformations import sumfact_performance_transformations
+
+
+def performance_transformations(kernel, signature):
+    if get_form_option("sumfact"):
+        kernel = sumfact_performance_transformations(kernel, signature)
+    return kernel
diff --git a/python/dune/codegen/loopy/transformations/remove_reductions.py b/python/dune/codegen/loopy/transformations/remove_reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ccca09c230f5ec32a9134035f3e883c63cc3680
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/remove_reductions.py
@@ -0,0 +1,71 @@
+import loopy as lp
+
+import pymbolic.primitives as prim
+
+
+def remove_reduction(knl, match):
+    """Removes all matching reductions and do direct accumulation in assignee instead"""
+
+    # Find reductions
+    for instr in lp.find_instructions(knl, match):
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            instructions = []
+
+            # Dependencies
+            depends_on = instr.depends_on
+            depending = []
+            for i in knl.instructions:
+                if instr.id in i.depends_on:
+                    depending.append(i.id)
+
+            # Remove the instruction from the kernel
+            knl = lp.remove_instructions(knl, set([instr.id]))
+
+            # Add instruction that sets assignee to zero
+            id_zero = instr.id + '_set_zero'
+            instructions.append(lp.Assignment(instr.assignee,
+                                              0,
+                                              within_inames=instr.within_inames,
+                                              id=id_zero,
+                                              tags=('set_zero',)
+                                              ))
+
+            # Add instruction that accumulates directly in assignee
+            assignee = instr.assignee
+            expression = prim.Sum((assignee, instr.expression.expr))
+            within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
+            id_accum = instr.id + '_accum'
+            instructions.append(lp.Assignment(assignee,
+                                              expression,
+                                              within_inames=within_inames,
+                                              id=id_accum,
+                                              depends_on=frozenset((id_zero,) + tuple(depends_on)),
+                                              tags=('assignment',)))
+
+            knl = knl.copy(instructions=knl.instructions + instructions)
+
+            # Restore dependencies
+            for dep in depending:
+                match = lp.match.Id(dep)
+                knl = lp.add_dependency(knl, match, id_accum)
+    return knl
+
+
+def remove_all_reductions(knl):
+    """Remove all reductions from loopy kernel
+
+    This removes all reductions by instead setting the assignee to zero and
+    directly accumulating in the assignee.
+    """
+    # Find ids of all reductions
+    ids = []
+    for instr in knl.instructions:
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            ids.append(instr.id)
+
+    # Remove reductions
+    for id in ids:
+        match = lp.match.Id(id)
+        knl = remove_reduction(knl, match)
+
+    return knl
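A minimal usage sketch for `remove_all_reductions`, assuming the dune-codegen Python package and its pinned loopy version are importable; the toy kernel is not part of dune-codegen and is only meant to show the effect of the transformation.

```python
import loopy as lp

from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions

# A toy kernel with a single reduction over i.
knl = lp.make_kernel("{ [i]: 0 <= i < n }", "out = sum(i, a[i])")

# After the transformation the reduction is gone: 'out' is first set to zero
# ('<id>_set_zero') and then accumulated into directly inside the i loop
# ('<id>_accum').
knl = remove_all_reductions(knl)
print(knl)
```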
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 257da40169f578a4b64a367f6aaa4cd55729253d..51722da3fe34a3fa24b85f4d47e06750c0cc0007 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -37,6 +37,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     debug_cache_with_stack = CodegenOption(default=False, helpstr="Store stack along with cache objects. Makes debugging caching issues easier.")
     driver_file = CodegenOption(helpstr="The filename for the generated driver header")
     explicit_time_stepping = CodegenOption(default=False, helpstr="use explicit time stepping")
+    time_stepping_order = CodegenOption(default=1, helpstr="Order of the time stepping method")
     exact_solution_expression = CodegenOption(helpstr="name of the exact solution expression in the ufl file")
     compare_l2errorsquared = CodegenOption(helpstr="maximal allowed l2 error squared of difference between numerical solution and interpolation of exact solution (NOTE: requires --exact-solution-expression)")
     grid_info = CodegenOption(default=None, helpstr="Path to file with information about facedir and facemod variations. This can be used to limit the generation of skeleton kernels.")
@@ -49,6 +50,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     architecture = CodegenOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl|skylake")
     yaspgrid_offset = CodegenOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
     grid_unstructured = CodegenOption(default=False, helpstr="Set to true if you want to use an unstructured grid.")
+    grid_consistent = CodegenOption(default=False, helpstr="Set to true if the used grid is already consistent.")
     precision_bits = CodegenOption(default=64, helpstr="The number of bits for the floating point type")
     overlapping = CodegenOption(default=False, helpstr="Use an overlapping solver and constraints. You still need to make sure to construct a grid with overlap! The parallel option will be set automatically.")
     operators = CodegenOption(default="r", helpstr="A comma separated list of operators, each name will be interpreted as a subsection name within the formcompiler section")
@@ -56,7 +58,9 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
     use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.")
+    use_sde = CodegenOption(default=False, helpstr="Use Intel SDE instead of own performance measurements.")
     autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
+    with_mpi = CodegenOption(default=True, helpstr="The module was configured with MPI.")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = CodegenOption(default=256, helpstr=None)
@@ -85,6 +89,9 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact = CodegenOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
+    sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in the sum factorization function using autotuning.")
+    sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.")
+    sumfact_performance_transformations_testrun = CodegenOption(default=0, helpstr="If larger than zero, determines the test case to run.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
@@ -111,6 +118,7 @@ class CodegenFormOptionsArray(ImmutableRecord):
     control_variable = CodegenOption(default=None, helpstr="Name of control variable in UFL file")
     block_preconditioner_diagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the diagonal part of a block preconditioner")
     block_preconditioner_offdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the off-diagonal part of a block preconditioner")
+    block_preconditioner_pointdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the point diagonal part of a block preconditioner")
     geometry_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for geometries. Currently implemented mixins: generic, axiparallel, equidistant, sumfact_multilinear, sumfact_axiparallel, sumfact_equidistant")
     quadrature_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for quadrature. Currently implemented: generic, sumfact")
     basis_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for basis function evaluation. Currently implemented: generic, sumfact")
@@ -125,6 +133,20 @@ _global_options = CodegenGlobalOptionsArray()
 _form_options = {}
 
 
+def show_options():
+    def subopt(arr):
+        for k, v in arr.__dict__.items():
+            if isinstance(v, CodegenOption) and v.helpstr is not None:
+                print("{}\n    {}".format(k, v.helpstr))
+
+    print("This is a summary of options available for the code generation process:\n")
+    print("The following options can be given in the [formcompiler] section:")
+    subopt(CodegenGlobalOptionsArray)
+
+    print("\nThefollowing options can be given in a form-specific subsection of [formcompiler]:")
+    subopt(CodegenFormOptionsArray)
+
+
 def initialize_options():
     """ Initialize the options from the command line """
     global _global_options
@@ -209,10 +231,16 @@ def process_form_options(opt, form):
     if opt.filename is None:
         opt = opt.copy(filename="{}_{}_file.hh".format(get_option("target_name"), opt.classname))
 
+    if opt.block_preconditioner_pointdiagonal:
+        opt = opt.copy(generate_jacobians=False,
+                       basis_mixins="sumfact_pointdiagonal",
+                       accumulation_mixins="sumfact_pointdiagonal",
+                       )
+
     if opt.block_preconditioner_diagonal or opt.block_preconditioner_offdiagonal:
         assert opt.numerical_jacobian is False
         opt = opt.copy(generate_residuals=False,
-                       generate_jacobians=False,
+                       generate_jacobians=True,
                        matrix_free=True,
                        )
 
diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py
index c3cc48298c1b2efb70c6c7716b29b221486044e1..dc1acd660c137c42be6fb65bb687bafe03fbc730 100644
--- a/python/dune/codegen/pdelab/argument.py
+++ b/python/dune/codegen/pdelab/argument.py
@@ -19,6 +19,7 @@ from dune.codegen.pdelab.spaces import (lfs_iname,
                                         )
 from dune.codegen.pdelab.restriction import restricted_name
 from dune.codegen.ufl.modified_terminals import Restriction
+from dune.codegen.options import get_form_option
 
 from pymbolic.primitives import Call, Subscript, Variable
 
@@ -116,6 +117,10 @@ def type_coefficientcontainer():
     return "X"
 
 
+def type_linearizationpointcontainer():
+    return "Z"
+
+
 def name_jacobian(restriction1, restriction2):
     # Restrictions may only differ if NONE
     if (restriction1 == Restriction.NONE) or (restriction2 == Restriction.NONE):
@@ -144,6 +149,8 @@ def name_accumulation_variable(restrictions=None):
                 restrictions = (Restriction.NONE,)
             else:
                 restrictions = (Restriction.POSITIVE,)
+        if get_form_option("block_preconditioner_pointdiagonal"):
+            restrictions = restrictions[:1]
         return name_residual(*restrictions)
     if ft == 'jacobian':
         if restrictions is None:
diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py
index b60544c1c78242f1490c76c46d4be6a2c4448501..124354b50dd354263800c84d99425cc6c9a57c01 100644
--- a/python/dune/codegen/pdelab/driver/__init__.py
+++ b/python/dune/codegen/pdelab/driver/__init__.py
@@ -39,7 +39,10 @@ def get_form_ident():
 
 def get_form():
     data = get_global_context_value("data")
-    return data.object_by_name[get_form_option("form", get_form_ident())]
+    form = get_form_option("form")
+    if form is None:
+        form = get_form_ident()
+    return data.object_by_name[form]
 
 
 def get_dimension():
@@ -212,7 +215,10 @@ def name_initree():
 @preamble(section="init")
 def define_mpihelper(name):
     include_file("dune/common/parallel/mpihelper.hh", filetag="driver")
-    return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    if get_option("with_mpi"):
+        return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    else:
+        return "Dune::FakeMPIHelper& {} = Dune::FakeMPIHelper::instance(argc, argv);".format(name)
 
 
 def name_mpihelper():
@@ -282,6 +288,13 @@ def generate_driver():
 
     contents = []
 
+    # Assert that this program was called with ini file
+    contents += ['if (argc != 2){',
+                 '  std::cerr << "This program needs to be called with an ini file" << std::endl;',
+                 '  return 1;',
+                 '}',
+                 '']
+
     def add_section(tag, comment):
         tagcontents = [i for i in retrieve_cache_items("preamble and {}".format(tag), make_generable=True)]
         if tagcontents:
diff --git a/python/dune/codegen/pdelab/driver/gridfunctionspace.py b/python/dune/codegen/pdelab/driver/gridfunctionspace.py
index 377b304b03499aa3c109390cf72f4af0f6327e9b..9aad1d5e6096c9d1735790badd63824f24026d42 100644
--- a/python/dune/codegen/pdelab/driver/gridfunctionspace.py
+++ b/python/dune/codegen/pdelab/driver/gridfunctionspace.py
@@ -82,7 +82,8 @@ def define_grid(name):
     _type = type_grid()
     # TODO: In principle this is only necessary if we use sum factorization in
     # one of the operators. So this could be turned off if that is not the case.
-    if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured"):
+    if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured") and not \
+            get_option("grid_consistent"):
         include_file("dune/consistent-edge-orientation/createconsistentgrid.hh", filetag="driver")
         return ["IniGridFactory<{}> factory({});".format(_type, ini),
                 "std::shared_ptr<{}> grid_nonconsistent = factory.getGrid();".format(_type),
diff --git a/python/dune/codegen/pdelab/driver/instationary.py b/python/dune/codegen/pdelab/driver/instationary.py
index 5cf7170f1dca6f4274db4bf77b4b1627466e20cb..355fd0e743f6a59716319241018d3c5794c784bd 100644
--- a/python/dune/codegen/pdelab/driver/instationary.py
+++ b/python/dune/codegen/pdelab/driver/instationary.py
@@ -132,10 +132,25 @@ def name_time():
 def typedef_timesteppingmethod(name):
     r_type = type_range()
     explicit = get_option('explicit_time_stepping')
+    order = get_option('time_stepping_order')
     if explicit:
-        return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type)
+        if order == 1:
+            return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type)
+        elif order == 2:
+            return "using {} = Dune::PDELab::HeunParameter<{}>;".format(name, r_type)
+        elif order == 3:
+            return "using {} = Dune::PDELab::Shu3Parameter<{}>;".format(name, r_type)
+        elif order == 4:
+            return "using {} = Dune::PDELab::RK4Parameter<{}>;".format(name, r_type)
+        else:
+            raise NotImplementedError("Time stepping method not supported")
     else:
-        return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type)
+        if order == 1:
+            return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type)
+        elif order == 2:
+            return "using {} = Dune::PDELab::Alexander2Parameter<{}>;".format(name, r_type)
+        elif order == 3:
+            return "using {} = Dune::PDELab::Alexander3Parameter<{}>;".format(name, r_type)
 
 
 def type_timesteppingmethod():
@@ -150,8 +165,12 @@ def define_timesteppingmethod(name):
     if explicit:
         return "{} {};".format(tsm_type, name)
     else:
-        ini = name_initree()
-        return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini)
+        order = get_option('time_stepping_order')
+        if order == 1:
+            ini = name_initree()
+            return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini)
+        else:
+            return "{} {};".format(tsm_type, name)
 
 
 def name_timesteppingmethod():
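For reference, the dispatch implemented by `typedef_timesteppingmethod` can be summarized as a mapping from `(explicit_time_stepping, time_stepping_order)` to the PDELab parameter class (a summary sketch, not code used by the generator):

```python
# (explicit, order) -> PDELab time stepping parameter class, as selected above.
TIME_STEPPING_PARAMETER = {
    (True, 1): "Dune::PDELab::ExplicitEulerParameter",
    (True, 2): "Dune::PDELab::HeunParameter",
    (True, 3): "Dune::PDELab::Shu3Parameter",
    (True, 4): "Dune::PDELab::RK4Parameter",
    (False, 1): "Dune::PDELab::OneStepThetaParameter",  # theta is read from the ini file
    (False, 2): "Dune::PDELab::Alexander2Parameter",
    (False, 3): "Dune::PDELab::Alexander3Parameter",
    # Explicit orders above 4 raise NotImplementedError; implicit orders above 3
    # are not mapped by the code above.
}
```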
diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py
index aeca64d46c73f2327b48c22f07dca7a85a044104..6bbbd07e4b7701fe516eff9509525165ac23a5eb 100644
--- a/python/dune/codegen/pdelab/driver/timings.py
+++ b/python/dune/codegen/pdelab/driver/timings.py
@@ -4,7 +4,7 @@ from dune.codegen.generation import (cached,
                                      include_file,
                                      pre_include,
                                      preamble,
-                                     )
+                                     post_include)
 from dune.codegen.options import get_option
 from dune.codegen.pdelab.driver import (get_form_ident,
                                         is_linear,
@@ -24,6 +24,9 @@ from dune.codegen.pdelab.driver.solve import (name_vector,
                                               )
 
 
+_sde_marks = {}
+
+
 @preamble(section="timings")
 def define_timing_identifier(name):
     ini = name_initree()
@@ -125,6 +128,17 @@ def local_operator_likwid():
     return "{}.register_likwid_timers();".format(lop_name)
 
 
+@preamble(section="timings")
+def local_operator_ssc_marks():
+    lop_name = name_localoperator(get_form_ident())
+    return "{}.dump_ssc_marks();".format(lop_name)
+
+
+def ssc_macro():
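+    # The byte sequence 100, 103, 144 below is 0x64 0x67 0x90, the marker
+    # pattern recognized by Intel SDE; the mark value is passed in %ebx.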
+    return '#define __SSC_MARK(x) do{ __asm__ __volatile__' \
+           '("movl %0, %%ebx; .byte 100, 103, 144" : :"i"(x) : "%ebx"); } while(0)'
+
+
 @cached
 def setup_timer():
     # TODO check that we are using YASP?
@@ -138,6 +152,10 @@ def setup_timer():
             logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably")
             local_operator_likwid()
         finalize_likwid()
+    elif get_option("use_sde"):
+        post_include(ssc_macro(), filetag='driver')
+        if get_option('instrumentation_level') >= 3:
+            local_operator_ssc_marks()
     else:
         from dune.codegen.loopy.target import type_floatingpoint
         pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver")
@@ -156,14 +174,26 @@ def init_region_timer(region):
     setup_timer()
     if get_option("use_likwid"):
         init_likwid_timer(region)
+    elif get_option("use_sde"):
+        pass
     else:
         from dune.codegen.generation import post_include
         post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver")
 
 
+def get_region_marks(region, driver):
+    if driver:
+        return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 11, (2 * (len(_sde_marks) + 1) + 1) * 11))
+    else:
+        return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 1, (2 * (len(_sde_marks) + 1) + 1) * 1))
+
+
 def start_region_timer(region):
     if get_option("use_likwid"):
         return ["LIKWID_MARKER_START(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[0])]
     else:
         return ["HP_TIMER_START({});".format(region)]
 
@@ -171,6 +201,10 @@ def start_region_timer(region):
 def stop_region_timer(region):
     if get_option("use_likwid"):
         return ["LIKWID_MARKER_STOP(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[1]),
+                "std::cout << \"Timed region {}: {} <--> {}\" << std::endl;".format(region, *marks)]
     else:
         timestream = name_timing_stream()
         return ["HP_TIMER_STOP({});".format(region),
@@ -207,7 +241,7 @@ def timed_region(region, actions):
 
         init_region_timer(region)
 
-        if get_option('instrumentation_level') >= 3 and not get_option('use_likwid'):
+        if get_option('instrumentation_level') >= 3 and not (get_option('use_likwid') or get_option("use_sde")):
             timestream = name_timing_stream()
             lop_name = name_localoperator(get_form_ident())
             print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index 11474dbf3489fae9fc85e944c9289322d8b3d3d7..1fd09fc706795afec6832fbe50207ac41dd5e48c 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -32,6 +32,7 @@ from dune.codegen.generation import (accumulation_mixin,
                                      ReturnArg,
                                      run_hook,
                                      template_parameter,
+                                     dump_ssc_marks
                                      )
 from dune.codegen.cgen.clazz import (AccessModifier,
                                      BaseClass,
@@ -153,6 +154,11 @@ def enum_alpha():
     return _enum_alpha(ufl_measure_to_pdelab_measure(integral_type))
 
 
+@class_member(classtag="operator")
+def enum_skeleton_twosided():
+    return "enum { doSkeletonTwoSided = true };"
+
+
 def name_initree_member():
     define_initree("_iniParams")
     return "_iniParams"
@@ -279,7 +285,7 @@ def determine_accumulation_space(info, number):
     return AccumulationSpace(lfs=lfs,
                              index=lfsi,
                              restriction=info.restriction,
-                             element=element
+                             element=subel
                              )
 
 
@@ -595,6 +601,10 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     for trafo in transformations:
         kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
+    # Apply performance transformations
+    from dune.codegen.loopy.transformations.performance import performance_transformations
+    kernel = performance_transformations(kernel, signature)
+
     from dune.codegen.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
@@ -688,6 +698,19 @@ class RegisterLikwidMethod(ClassMember):
         ClassMember.__init__(self, content)
 
 
+class RegisterSSCMarksMethod(ClassMember):
+    def __init__(self):
+        knl = name_example_kernel()
+        assert(knl is not None)
+
+        content = ["void dump_ssc_marks()"
+                   "{"]
+        register_liwkid_timers = [i for i in retrieve_cache_items(condition='register_ssc_marks')]
+        content.extend(map(lambda x: '  ' + x, register_liwkid_timers))
+        content += ["}"]
+        ClassMember.__init__(self, content)
+
+
 class LoopyKernelMethod(ClassMember):
     def __init__(self, signature, kernel, add_timings=True, initializer_list=[]):
         from loopy import generate_body
@@ -715,6 +738,12 @@ class LoopyKernelMethod(ClassMember):
                     init_likwid_timer(timer_name)
                     content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name))
                     register_liwkid_timer(timer_name)
+                elif get_option('use_sde'):
+                    from dune.codegen.pdelab.driver.timings import get_region_marks, ssc_macro
+                    post_include(ssc_macro(), filetag='operatorfile')
+                    marks = get_region_marks(timer_name, driver=False)
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[0]))
+                    dump_ssc_marks(timer_name)
                 else:
                     post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
                     content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
@@ -727,6 +756,11 @@ class LoopyKernelMethod(ClassMember):
                         init_likwid_timer(setuptimer)
                         content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer))
                         register_liwkid_timer(setuptimer)
+                    elif get_option('use_sde'):
+                        from dune.codegen.pdelab.driver.timings import get_region_marks
+                        setup_marks = get_region_marks(setuptimer, driver=False)
+                        content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[0]))
+                        dump_ssc_marks(setuptimer)
                     else:
                         post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
                         content.append('  HP_TIMER_START({});'.format(setuptimer))
@@ -739,6 +773,8 @@ class LoopyKernelMethod(ClassMember):
             if add_timings and get_option('instrumentation_level') >= 4:
                 if get_option('use_likwid'):
                     content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[1]))
                 else:
                     content.append('  ' + 'HP_TIMER_STOP({});'.format(setuptimer))
 
@@ -749,6 +785,8 @@ class LoopyKernelMethod(ClassMember):
             if add_timings and get_option('instrumentation_level') >= 3:
                 if get_option('use_likwid'):
                     content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[1]))
                 else:
                     content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
 
@@ -838,6 +876,14 @@ def local_operator_default_settings(operator, form):
         base_class('Dune::PDELab::InstationaryLocalOperatorDefaultMethods<{}>'
                    .format(rf), classtag="operator")
 
+    # *always* add the volume pattern, PDELab cannot handle matrices without diagonal blocks
+    with global_context(integral_type="cell"):
+        enum_pattern()
+        pattern_baseclass()
+
+    if get_form_option("block_preconditioner_diagonal") or get_form_option("block_preconditioner_pointdiagonal"):
+        enum_skeleton_twosided()
+
 
 def measure_is_enabled(measure):
     option_dict = {"cell": "enable_volume",
@@ -852,6 +898,16 @@ def generate_residual_kernels(form, original_form):
     if not get_form_option("generate_residuals"):
         return {}
 
+    if get_form_option("block_preconditioner_pointdiagonal"):
+        from ufl import derivative
+        jacform = derivative(original_form, original_form.coefficients()[0])
+
+        from dune.codegen.ufl.preprocess import preprocess_form
+        jacform = preprocess_form(jacform).preprocessed_form
+
+        from dune.codegen.ufl.transformations.blockpreconditioner import diagonal_block_jacobian
+        form = diagonal_block_jacobian(jacform)
+
     logger = logging.getLogger(__name__)
     with global_context(form_type='residual'):
         operator_kernels = {}
@@ -954,6 +1010,12 @@ def generate_jacobian_kernels(form, original_form):
                     from dune.codegen.pdelab.signatures import assembler_routine_name
                     with global_context(kernel=assembler_routine_name()):
                         kernel = [k for k in generate_kernels_per_integral(jac_apply_form.integrals_by_type(measure))]
+
+                        if kernel:
+                            enum_pattern()
+                            pattern_baseclass()
+                            enum_alpha()
+
                 operator_kernels[(measure, 'jacobian_apply')] = kernel
 
                 # Generate dummy functions for those kernels, that vanished in the differentiation process
@@ -983,6 +1045,12 @@ def generate_jacobian_kernels(form, original_form):
                         from dune.codegen.pdelab.signatures import assembler_routine_name
                         with global_context(kernel=assembler_routine_name()):
                             kernel = [k for k in generate_kernels_per_integral(jacform.integrals_by_type(measure))]
+
+                            if kernel:
+                                enum_pattern()
+                                pattern_baseclass()
+                                enum_alpha()
+
                     operator_kernels[(measure, 'jacobian')] = kernel
 
                 # Generate dummy functions for those kernels, that vanished in the differentiation process
@@ -1181,6 +1249,8 @@ def generate_localoperator_file(kernels, filename):
         include_file('dune/codegen/common/timer.hh', filetag='operatorfile')
         if get_option('use_likwid'):
             operator_methods.append(RegisterLikwidMethod())
+        elif get_option('use_sde'):
+            operator_methods.append(RegisterSSCMarksMethod())
         else:
             operator_methods.append(TimerMethod())
     elif get_option('opcounter'):
diff --git a/python/dune/codegen/pdelab/signatures.py b/python/dune/codegen/pdelab/signatures.py
index 09b832ac252138181d8be19c9fd4c098b5bb9b68..34acea22ea1fda1b8369bf499d7ce8ed0c37859d 100644
--- a/python/dune/codegen/pdelab/signatures.py
+++ b/python/dune/codegen/pdelab/signatures.py
@@ -9,6 +9,7 @@ from dune.codegen.pdelab.argument import (name_accumulation_variable,
                                           name_coefficientcontainer,
                                           type_coefficientcontainer,
                                           name_applycontainer,
+                                          type_linearizationpointcontainer,
                                           )
 from dune.codegen.pdelab.spaces import (name_testfunctionspace,
                                         type_testfunctionspace,
@@ -293,8 +294,9 @@ def nonlinear_jacobian_apply_volume_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 
 
 def nonlinear_jacobian_apply_volume_args():
@@ -312,8 +314,9 @@ def nonlinear_jacobian_apply_boundary_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 
 
 def nonlinear_jacobian_apply_boundary_args():
@@ -331,8 +334,9 @@ def nonlinear_jacobian_apply_skeleton_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, lfsut, cct, cct, lfsvt, avt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, lfsut, cct, lpt, lfsvt, avt, avt)
 
 
 def nonlinear_jacobian_apply_skeleton_args():
diff --git a/python/dune/codegen/pdelab/tensors.py b/python/dune/codegen/pdelab/tensors.py
index a924a39ae1bba19a94c47e5542567d8717cedf57..68f6d393129600f0fda995ea1a4f239ff844c173 100644
--- a/python/dune/codegen/pdelab/tensors.py
+++ b/python/dune/codegen/pdelab/tensors.py
@@ -22,7 +22,7 @@ def define_assembled_tensor(name, expr, visitor):
         visitor.indices = indices
         instruction(assignee=prim.Subscript(prim.Variable(name), indices),
                     expression=visitor.call(expr),
-                    forced_iname_deps=frozenset(visitor.interface.quadrature_inames()),
+                    forced_iname_deps=frozenset(visitor.quadrature_inames()),
                     depends_on=frozenset({lp.match.Tagged("sumfact_stage1")}),
                     tags=frozenset({"quad"}),
                     )
@@ -36,17 +36,43 @@ def name_assembled_tensor(o, visitor):
 
 
 @kernel_cached
+def code_generation_time_inversion(expr, visitor):
+    mat = np.ndarray(expr.ufl_shape)
+    for indices in it.product(*tuple(range(i) for i in expr.ufl_shape)):
+        visitor.indices = indices
+        val = visitor.call(expr.ufl_operands[0])
+        if not isinstance(val, (float, int)):
+            visitor.indices = None
+            return None
+
+        mat[indices] = val
+
+    visitor.indices = None
+    return np.linalg.inv(mat)
+
+
 def pymbolic_matrix_inverse(o, visitor):
+    # Try to evaluate the matrix at code generation time.
+    # If this works (it does e.g. for Maxwell on structured grids)
+    # we can invert the matrix at code generation time!!!
     indices = visitor.indices
     visitor.indices = None
+
+    mat = code_generation_time_inversion(o, visitor)
+    if mat is not None:
+        return mat[indices]
+
+    # If code generation time inversion failed, we assemble it in C++
+    # and invert it there.
     name = name_assembled_tensor(o.ufl_operands[0], visitor)
 
     instruction(code="{}.invert();".format(name),
-                within_inames=frozenset(visitor.interface.quadrature_inames()),
+                within_inames=frozenset(visitor.quadrature_inames()),
                 depends_on=frozenset({lp.match.Writes(name),
                                       lp.match.Tagged("sumfact_stage1"),
                                       }),
                 tags=frozenset({"quad"}),
+                assignees=frozenset({name}),
                 )
 
     visitor.indices = indices
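The idea behind `code_generation_time_inversion`, as a standalone sketch (the matrix values are made up):

```
import numpy as np

# If every entry of the matrix expression evaluates to a plain number at code
# generation time, the inverse can be tabulated with numpy and no invert()
# call needs to be emitted into the generated operator.
mat = np.array([[2.0, 0.0], [0.0, 4.0]])
inv = np.linalg.inv(mat)
assert np.allclose(inv, [[0.5, 0.0], [0.0, 0.25]])
```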
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index 9394f746ec9cd7bafa569ccfbcb83d9a8e44663f..6e87697403eda5a8373b043fad3140fd56f87175 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -52,6 +52,7 @@ import pymbolic.primitives as prim
 from loopy.symbolic import WalkMapper
 import ufl.classes as uc
 from ufl import FiniteElement, MixedElement, TensorProductElement
+import itertools
 
 
 basis_sf_kernels = generator_factory(item_tags=("basis_sf_kernels",), context_tags='kernel', no_deco=True)
@@ -158,10 +159,12 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         if self.trial_element is None:
             return ()
         else:
-            from dune.codegen.sumfact.basis import SumfactBasisMixin
-            return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
+            mixin = get_form_option("basis_mixins")
+            from dune.codegen.generation import construct_from_mixins
+            MixinType = construct_from_mixins(mixins=[mixin], mixintype="basis", name="MixinType")
+            return MixinType.lfs_inames(MixinType(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The result of stage 2 has the correct quadrature permutation but no
         # cost permutation is applied. The inames for this method are
         # quadrature and cost permuted. This means we need to reverse the cost
@@ -172,7 +175,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -347,7 +351,9 @@ class SumfactAccumulationMixin(AccumulationMixinBase):
         return get_accumulation_info(expr, self)
 
     def list_accumulation_infos(self, expr):
-        return list_accumulation_infos(expr, self)
+        return itertools.product(_gradsplitting_generator(expr, self),
+                                 _trial_generator(expr, self),
+                                 )
 
     def generate_accumulation_instruction(self, expr):
         return generate_accumulation_instruction(expr, self)
@@ -368,6 +374,61 @@ class SumfactAccumulationMixin(AccumulationMixinBase):
             return get_global_context_value("facemod_n")
         return None
 
+    def additional_matrix_sequence(self):
+        return None
+
+    @property
+    def prohibit_jacobian(self):
+        return False
+
+
+@accumulation_mixin("sumfact_pointdiagonal")
+class SumfactPointDiagonalAccumulationMixin(SumfactAccumulationMixin):
+    def additional_matrix_sequence(self):
+        info = self.current_info[1]
+        return construct_basis_matrix_sequence(transpose=True,
+                                               derivative=info.grad_index,
+                                               facedir=get_facedir(info.restriction),
+                                               facemod=get_facemod(info.restriction),
+                                               basis_size=get_basis_size(info),
+                                               )
+
+    def get_accumulation_info(self, expr):
+        element = expr.ufl_element()
+        leaf_element = element
+        element_index = 0
+        from ufl import MixedElement
+        if isinstance(expr.ufl_element(), MixedElement):
+            element_index = self.indices[0]
+            leaf_element = element.extract_component(element_index)[1]
+
+        restriction = self.restriction
+        if self.measure == 'exterior_facet':
+            from dune.codegen.pdelab.restriction import Restriction
+            restriction = Restriction.POSITIVE
+
+        grad_index = None
+        if self.reference_grad:
+            if isinstance(expr.ufl_element(), MixedElement):
+                grad_index = self.indices[1]
+            else:
+                grad_index = self.indices[0]
+
+        return SumfactAccumulationInfo(element=expr.ufl_element(),
+                                       element_index=element_index,
+                                       restriction=restriction,
+                                       grad_index=grad_index,
+                                       )
+
+    def list_accumulation_infos(self, expr):
+        return itertools.product(_gradsplitting_generator(expr, self, number=0),
+                                 _gradsplitting_generator(expr, self, number=1),
+                                 )
+
+    @property
+    def prohibit_jacobian(self):
+        return True
+
 
 class SumfactAccumulationInfo(ImmutableRecord):
     def __init__(self,
@@ -434,9 +495,9 @@ def _get_childs(element):
             yield (i, element.extract_component(i)[1])
 
 
-def _test_generator(expr, visitor):
+def _gradsplitting_generator(expr, visitor, number=0):
     from dune.codegen.ufl.modified_terminals import extract_modified_arguments
-    ma = extract_modified_arguments(expr, argnumber=0)
+    ma = extract_modified_arguments(expr, argnumber=number)
     if len(ma) == 0:
         return
     element = ma[0].argexpr.ufl_element()
@@ -452,7 +513,8 @@ def _test_generator(expr, visitor):
     for res in restrictions:
         for ei, e in _get_childs(element):
             for grad in (None,) + tuple(range(dim)):
-                yield SumfactAccumulationInfo(element_index=ei,
+                yield SumfactAccumulationInfo(element=element,
+                                              element_index=ei,
                                               restriction=res,
                                               grad_index=grad)
 
@@ -477,13 +539,20 @@ def _trial_generator(expr, visitor):
             yield SumfactAccumulationInfo(element_index=ei, restriction=res, element=e)
 
 
-def list_accumulation_infos(expr, visitor):
-    import itertools
-    return itertools.product(_test_generator(expr, visitor), _trial_generator(expr, visitor))
+def get_basis_size(info):
+    leaf_element = info.element
+    element_index = info.element_index
+    dim = world_dimension()
+    from ufl import MixedElement
+    if isinstance(leaf_element, MixedElement):
+        leaf_element = leaf_element.extract_component(element_index)[1]
+    degree = leaf_element._degree
+    if isinstance(degree, int):
+        degree = (degree,) * dim
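+    # E.g. a scalar element of degree 2 in 3D yields basis_size == (3, 3, 3).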
+    return tuple(deg + 1 for deg in degree)
 
 
 def generate_accumulation_instruction(expr, visitor):
-    dim = world_dimension()
     test_info = visitor.test_info
     trial_info = visitor.trial_info
 
@@ -492,14 +561,7 @@ def generate_accumulation_instruction(expr, visitor):
     count_quadrature_point_operations(expr)
 
     # Number of basis functions per direction
-    leaf_element = test_info.element
-    from ufl import MixedElement
-    if isinstance(leaf_element, MixedElement):
-        leaf_element = leaf_element.extract_component(test_info.element_index)[1]
-    degree = leaf_element._degree
-    if isinstance(degree, int):
-        degree = (degree,) * dim
-    basis_size = tuple(deg + 1 for deg in degree)
+    basis_size = get_basis_size(test_info)
 
     # Anisotropic finite elements are not (yet) supported by Dune
     assert(size == basis_size[0] for size in basis_size)
@@ -535,20 +597,27 @@ def generate_accumulation_instruction(expr, visitor):
         derivative=test_info.grad_index,
         facedir=visitor.get_facedir(test_info.restriction),
         facemod=visitor.get_facemod(test_info.restriction),
-        basis_size=basis_size)
+        basis_size=basis_size,
+        additional_sequence=visitor.additional_matrix_sequence())
 
     jacobian_inames = trial_info.inames
     priority = test_info.grad_index
     if priority is None:
         priority = 3
 
+    trial_element = trial_info.element
+    trial_element_index = trial_info.element_index
+    if visitor.prohibit_jacobian:
+        trial_element = None
+        trial_element_index = 0
+
     output = AccumulationOutput(matrix_sequence=matrix_sequence,
                                 accumvar=accumvar,
                                 restriction=(test_info.restriction, trial_info.restriction),
                                 test_element=test_info.element,
                                 test_element_index=test_info.element_index,
-                                trial_element=trial_info.element,
-                                trial_element_index=trial_info.element_index,
+                                trial_element=trial_element,
+                                trial_element_index=trial_element_index,
                                 )
 
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index a1d7442addc4e2d1d720b1cc81d8a914c1f75532..1e801f4550d17d89ef8972fe3ffa867d4a44ea25 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -1,21 +1,28 @@
 """ Autotuning for sum factorization kernels """
 
-from dune.codegen.generation import cache_restoring, delete_cache_items
-from dune.codegen.loopy.target import DuneTarget
-from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
-from dune.codegen.options import get_option, set_option
-from dune.codegen.error import CodegenAutotuneError
-
-import loopy as lp
-from pytools import product
-
 import os
 import re
 import subprocess
 import filelock
 import hashlib
+import logging
+import json
+from operator import mul
+import pkg_resources
+from six.moves import reduce
+import textwrap
 import time
 
+import loopy as lp
+from pytools import product
+from cgen import ArrayOf, AlignedAttribute, Initializer
+
+from dune.codegen.generation import cache_restoring, delete_cache_items
+from dune.codegen.loopy.target import DuneTarget, type_floatingpoint
+from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
+from dune.codegen.options import get_option, set_option
+from dune.codegen.error import CodegenAutotuneError
+
 
 def get_cmake_cache_entry(entry):
     for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
@@ -301,13 +308,207 @@ def generate_standalone_code(sf, filename):
     set_option("opcounter", opcounting)
 
 
-def autotune_realization(sf):
+def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
+    # Turn off opcounting
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
+    # Remove opcounter from signature
+    p = re.compile('OpCounter::OpCounter<([^>]*)>')
+    assert len(signature) == 1
+    sig = signature[0]
+    sig = p.sub(r'\1', sig)
+    assert 'OpCounter' not in sig
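+    # E.g. a (hypothetical) signature "void f(OpCounter::OpCounter<double>* a)"
+    # becomes "void f(double* a)".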
+
+    # Which transformations were applied
+    codegen_transformations = ''
+    if transformations:
+        codegen_transformations = ''
+        for trafo in transformations:
+            codegen_transformations += '// {}\n'.format(trafo)
+
+    template = 'kernel_benchmark_template1.cc.in'
+    use_datasets = True
+
+    # Old benchmark template
+    # template = 'kernel_benchmark_template0.cc.in'
+    # use_datasets = False
+
+    template_filename = pkg_resources.resource_filename(__name__, template)
+    with open(template_filename, 'r') as f:
+        benchmark = f.read()
+
+    # Find function arguments and global arguments
+    arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
+    arguments = [a.split(' ')[-1] for a in arguments]
+    global_args = [a for a in kernel.args if a.name not in arguments]
+    buffer_arguments = [a for a in arguments if a.startswith('buff')]
+    input_arguments = [a for a in arguments if a not in buffer_arguments]
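+    # E.g. a (hypothetical) argument list "const double* fastdg0, char* buff0"
+    # yields arguments == ['fastdg0', 'buff0'], buffer_arguments == ['buff0']
+    # and input_arguments == ['fastdg0'].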
+
+    # Declare global arguments
+    codegen_declare_global_arguments = ''
+    target = DuneTarget()
+    for g in global_args:
+        decl_info = g.decl_info(target, True, g.dtype)
+        for idi in decl_info:
+            ast_builder = target.get_device_ast_builder()
+            arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
+            arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
+            arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
+            codegen_declare_global_arguments += '{}\n'.format(arg_decl)
+    codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, '  ')
+
+    # Helper function for argument initialization
+    def _initialize_arg(arg):
+        if isinstance(arg, lp.ValueArg):
+            return []
+        real = type_floatingpoint()
+        size = reduce(mul, arg.shape)
+        fill_name = arg.name + '_fill'
+        lines = ['  {0}* {1} = ({0}*) {2};'.format(real, fill_name, arg.name),
+                 '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
+                 '    {}[i] = unif(re);'.format(fill_name),
+                 '  }']
+        return lines
+
+    # Initialize global arguments
+    codegen_initialize_global_arguments = ''
+    for arg in global_args:
+        lines = _initialize_arg(arg)
+        codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, '  ')
+
+    # Function we want to benchmark
+    codegen_benchmark_function = ''
+    codegen_benchmark_function += sig[0:sig.find(')') + 1]
+    codegen_benchmark_function += lp.generate_body(kernel)
+    codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, '  ')
+
+    # Declare function arguments
+    codegen_declare_arguments = []
+    codegen_declare_input = []
+    function_arguments = [a for a in kernel.args if a.name in arguments]
+    for arg in function_arguments:
+        if 'buffer' in arg.name:
+            byte_size = reduce(mul, arg.shape) * 8
+            codegen_declare_arguments.append('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
+                                                                                                      byte_size,
+                                                                                                      arg.alignment),)
+        elif isinstance(arg, lp.ValueArg):
+            assert 'jacobian_offset' in arg.name
+            decl = arg.get_arg_decl(ast_builder)
+            decl = Initializer(decl, 'unif_int(re)')
+            codegen_declare_arguments.append(('  {}\n'.format(decl)))
+        else:
+            assert 'fastdg' in arg.name
+            size = reduce(mul, arg.shape)
+            min_stride = min([tag.stride for tag in arg.dim_tags])
+            size *= min_stride
+            alignment = arg.dtype.itemsize
+            real = type_floatingpoint()
+            if use_datasets:
+                codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                             arg.name,
+                                                                                                             size,
+                                                                                                             alignment)))
+            else:
+                codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                   arg.name,
+                                                                                                   size,
+                                                                                                   alignment)))
+
+    codegen_declare_arguments = ''.join(codegen_declare_arguments)
+    codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, '  ')
+    codegen_declare_input = ''.join(codegen_declare_input)
+    codegen_declare_input = textwrap.indent(codegen_declare_input, '  ')
+
+    # Initialize function arguments
+    codegen_initialize_arguments = ''
+    codegen_initialize_input = ''
+    for arg in function_arguments:
+        if 'fastdg' in arg.name:
+            if use_datasets:
+                lines = _initialize_arg(arg)
+                lines = ['  ' + a for a in lines]
+                lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
+                lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
+                lines.append('}')
+                codegen_initialize_input += '\n'.join(lines) + '\n'
+            else:
+                lines = _initialize_arg(arg)
+                codegen_initialize_arguments += '\n'.join(lines) + '\n'
+        else:
+            lines = _initialize_arg(arg)
+            codegen_initialize_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, '  ')
+    codegen_initialize_input = textwrap.indent(codegen_initialize_input, '  ')
+
+    # Call the benchmark function
+    if use_datasets:
+        arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
+        codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
+        codegen_call_benchmark_function += '  ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
+        for arg in input_arguments:
+            codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
+        codegen_call_benchmark_function += '}'
+    else:
+        codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
+    codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, '    ')
+
+    # Replace placeholders in benchmark template
+    benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
+    benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
+    benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
+
+    # Write benchmark source file
+    with open(filename, 'w') as f:
+        f.writelines(benchmark)
+
+    # Maybe turn opcounting on again
+    set_option("opcounter", opcounting)
+
+
+def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
+    """Generate an microbenchmark, compile run and return time
+
+    Parameters
+    ----------
+    sf: SumfactKernel or VectorizedSumfactKernel
+    kernel: loopy.kernel.LoopKernel
+    signature: str
+    transformation: list of str
+        Will be used to distinguish between autotune targets
+    """
+    if sf is None:
+        assert kernel is not None
+        assert signature is not None
+    else:
+        assert kernel is None
+        assert signature is None
+
+    logger = logging.getLogger(__name__)
+
     # Make sure that the benchmark directory exists
     dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
     if not os.path.exists(dir):
         os.mkdir(dir)
 
-    basename = "autotune_sumfact_{}".format(sf.function_name)
+    if sf is None:
+        basename = "autotune_sumfact_{}".format(kernel.name)
+    else:
+        basename = "autotune_sumfact_{}".format(sf.function_name)
+    if transformations:
+        for trafo in transformations:
+            basename = '{}_{}'.format(basename, trafo)
     basename = hashlib.sha256(basename.encode()).hexdigest()
 
     filename = os.path.join(dir, "{}.cc".format(basename))
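A standalone sketch of the benchmark file naming above (the kernel name and transformation tag are examples): the readable name is hashed so file names stay short and unique per autotune target.

```
import hashlib

def benchmark_basename(name, transformations=()):
    basename = "autotune_sumfact_{}".format(name)
    for trafo in transformations:
        basename = '{}_{}'.format(basename, trafo)
    return hashlib.sha256(basename.encode()).hexdigest()

print(benchmark_basename("sfimpl_example", ("vectorize",)))
```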
@@ -316,10 +517,16 @@ def autotune_realization(sf):
     executable = os.path.join(dir, basename)
 
     # Generate and compile a benchmark program
+    #
+    # Note: cache restoring is only necessary when generating from SumfactKernel
     with cache_restoring():
         with filelock.FileLock(lock):
             if not os.path.isfile(logname):
-                if get_option("autotune_google_benchmark"):
+                logger.debug('Generate autotune target in file {}'.format(filename))
+
+                if sf is None:
+                    generate_standalone_kernel_code(kernel, signature, filename, transformations)
+                elif get_option("autotune_google_benchmark"):
                     generate_standalone_code_google_benchmark(sf, filename)
                 else:
                     generate_standalone_code(sf, filename)
@@ -331,6 +538,7 @@ def autotune_realization(sf):
 
                 call.extend(compiler_invocation(executable, filename))
                 devnull = open(os.devnull, 'w')
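+                # The generated benchmark (kernel_benchmark_template1.cc.in) reads
+                # DUNE_CODEGEN_THREADS at run time; autotuning runs single-threaded.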
+                os.environ['DUNE_CODEGEN_THREADS'] = '1'
                 ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
                     raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
@@ -349,6 +557,7 @@ def autotune_realization(sf):
                 call.append(executable)
                 if get_option("autotune_google_benchmark"):
                     call.append("--benchmark_out={}".format(logname))
+                    call.append("--benchmark_repetitions=5")
                     # call.append("--benchmark_out_format=csv")
                 else:
                     call.append(logname)
@@ -366,7 +575,15 @@ def autotune_realization(sf):
                 with open(logname) as json_file:
                     try:
                         data = json.load(json_file)
-                        return data['benchmarks'][0]['cpu_time']
+                        minimal_time = 1e80
+                        for b in data['benchmarks']:
+                            if b['name'].endswith('_mean') or b['name'].endswith('_median') or b['name'].endswith('_stddev'):
+                                pass
+                            else:
+                                if b['cpu_time'] < minimal_time:
+                                    minimal_time = b['cpu_time']
+                        assert minimal_time < 1e80
+                        return minimal_time
                     except Exception as e:
                         print("Error while loading file {}".format(logname))
                         raise e
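A standalone sketch (log file name hypothetical) of the google-benchmark result handling above: with `--benchmark_repetitions=5` the JSON log contains one entry per repetition plus `_mean`/`_median`/`_stddev` aggregates; only the raw repetitions are considered and the fastest one is returned.

```
import json

def minimal_cpu_time(logname):
    with open(logname) as json_file:
        data = json.load(json_file)
    times = [b['cpu_time'] for b in data['benchmarks']
             if not b['name'].endswith(('_mean', '_median', '_stddev'))]
    assert times
    return min(times)
```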
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index 8be5fc9e4a5157fee7f597b3000282238d85dd56..83e5206b5289719f3c4e85f4ca523976cb6feb79 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -232,6 +232,28 @@ class SumfactBasisMixin(GenericBasisMixin):
         return prim.Subscript(var, vsf.quadrature_index(sf, self))
 
 
+@basis_mixin("sumfact_pointdiagonal")
+class SumfactPointDiagonalBasisMixin(SumfactBasisMixin):
+    def lfs_inames(self, element, restriction, number=1):
+        return ()
+
+    def implement_basis(self, element, restriction, number):
+        info = self.current_info[number]
+        if element == info.element and restriction == info.restriction:
+            return 1
+        else:
+            return 0
+
+    def implement_reference_gradient(self, element, restriction, number):
+        index, = self.indices
+        self.indices = None
+        info = self.current_info[number]
+        if element == info.element and restriction == info.restriction and index == info.grad_index:
+            return 1
+        else:
+            return 0
+
+
 class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
     def __init__(self,
                  matrix_sequence=None,
@@ -342,14 +364,15 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Note: Here we do not need to reverse any permutation since this is
         # already done in the setup_input method above!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index 14a095696de75f6e169d5772949d1ed4707c2c7b..6f82b196a048996ed25a9eb22465565a181b5722 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -454,11 +454,12 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c4b49da5aec59006a9f007534dc7711b98ef0387
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
@@ -0,0 +1,37 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+#include <iostream>
+#include <fstream>
+#include <random>
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+${CODEGEN_BENCHMARK_FUNCTION}
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  std::uniform_real_distribution<double> unif(0,1);
+  std::uniform_int_distribution<int> unif_int(0,1);
+  std::default_random_engine re;
+
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+  ${CODEGEN_INITIALIZE_INPUT}
+
+  ${CODEGEN_DECLARE_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+  for (auto _ : state){
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+}
+
+${CODEGEN_BENCHMARK}
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c03577d672123b3063020841b731fc1fd9780fce
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
@@ -0,0 +1,58 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+
+#include <iostream>
+#include <fstream>
+#include <random>
+#include <cstdlib>
+
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+const int datasets = 512;
+
+class Benchmark{
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+
+public:
+  Benchmark (){
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+    ${CODEGEN_INITIALIZE_INPUT}
+  }
+
+  ${CODEGEN_BENCHMARK_FUNCTION}
+
+  void run_benchmark(){
+    ${CODEGEN_DECLARE_ARGUMENTS}
+
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+
+};
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  Benchmark benchmark;
+  for (auto _ : state){
+    benchmark.run_benchmark();
+  }
+}
+
+BENCHMARK(BM_sumfact_kernel) -> Threads(atoi(std::getenv("DUNE_CODEGEN_THREADS"))) -> UseRealTime();
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index 14cd0b1d0fe3814f294c22bc00f25260f4ee0233..461f5b0a3ace21e09a7a3707c4849aa1c36e6972 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -30,7 +30,6 @@ from dune.codegen.sumfact.quadrature import quadrature_points_per_direction
 from dune.codegen.sumfact.symbolic import (SumfactKernel,
                                            VectorizedSumfactKernel,
                                            )
-from dune.codegen.sumfact.vectorization import attach_vectorization_info
 from dune.codegen.sumfact.accumulation import sumfact_iname
 from dune.codegen.loopy.target import dtype_floatingpoint
 from dune.codegen.loopy.vcl import ExplicitVCLCast
@@ -61,6 +60,12 @@ def name_buffer_storage(buff, which):
     return name
 
 
+def _max_sum_factorization_buffer_size(sf):
+    size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
+               product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+    return size
+
+
 @kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
@@ -74,8 +79,7 @@ def _realize_sum_factorization_kernel(sf):
     for buf in buffers:
         # Determine the necessary size of the buffer. We assume that we do not
         # underintegrate the form!!!
-        size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
-                   product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+        size = _max_sum_factorization_buffer_size(sf)
         temporary_variable("{}_dummy".format(buf),
                            shape=(size,),
                            custom_base_storage=buf,
@@ -118,10 +122,32 @@ class BufferSwitcher(object):
     def __init__(self):
         self.current = 0
 
-    def get_temporary(self, name=None, **kwargs):
+    def get_temporary(self, sf=None, name=None, **kwargs):
+        assert sf
         assert name
+
         bs = "buffer{}".format(self.current)
-        globalarg(bs)
+        shape = kwargs['shape']
+        assert shape
+        dim_tags = kwargs['dim_tags'].split(',')
+        assert dim_tags
+
+        # Calculate correct alignment
+        vec_size = 1
+        if 'vec' in dim_tags:
+            vec_size = shape[dim_tags.index('vec')]
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = np.dtype(kwargs.get("dtype", dtype_floatingpoint()))
+        alignment = dtype.itemsize * vec_size
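+        # E.g. a 'vec' width of 4 in double precision gives 4 * 8 = 32 byte alignment.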
+
+        # Add this buffer as global argument to the kernel. Add a shape to make
+        # benchmark generation for autotuning of loopy kernels easier. Since
+        # this buffer is used to store different data sizes we need to make
+        # sure it is big enough.
+        size = _max_sum_factorization_buffer_size(sf)
+        globalarg(bs, shape=(size,), alignment=alignment, dim_tags=['f', ])
+
         temporary_variable(name,
                            managed=True,
                            custom_base_storage=bs,
@@ -193,7 +219,8 @@ def realize_sumfact_kernel_function(sf):
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             input_summand = sf.interface.realize_direct_input(input_inames, inp_shape)
         elif l == 0:
-            input_summand = sf.interface.realize_input(input_inames,
+            input_summand = sf.interface.realize_input(sf,
+                                                       input_inames,
                                                        inp_shape,
                                                        vec_iname,
                                                        vec_shape,
@@ -204,7 +231,8 @@ def realize_sumfact_kernel_function(sf):
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
-            inp = buffer.get_temporary("buff_step{}_in".format(l),
+            inp = buffer.get_temporary(sf,
+                                       "buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -250,7 +278,8 @@ def realize_sumfact_kernel_function(sf):
                 output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation)
                 output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation)
 
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -265,7 +294,8 @@ def realize_sumfact_kernel_function(sf):
 
         else:
             output_shape = tuple(out_shape[1:]) + (out_shape[0],)
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 60641b0ce4af4ddef525a4c03a438c4ebe277c0c..dfd9383f93c79d3c50bfcca24cdd717fcc16aa58 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -60,7 +60,7 @@ class SumfactKernelInterfaceBase(object):
         """
         raise NotImplementedError
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         """Interpret the input of sumfact kernel function in the right way (non fastdgg)
 
         This happens inside the sumfact kernel function.
@@ -73,6 +73,7 @@ class SumfactKernelInterfaceBase(object):
 
         Parameters
         ----------
+        sf : SumfactKernel or VectorizedSumfactKernel
         inames : tuple of pymbolic.primitives.Variable
             Inames for accesing the input. Ordered according to permuted matrix sequence.
         shape : tuple of int
@@ -252,11 +253,12 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return prim.Call(prim.Variable(hadd_function), (result,))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The input for stage 3 is quadrature permuted. The inames and shape
         # passed to this method are quadrature and cost permuted. This means we
         # need to take the cost permutation back to get the right inames and
@@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         # Get a temporary that interprets the base storage of the input as a
         # column-major matrix.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -382,9 +385,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def realize_direct_output(self, result, inames, shape, **args):
         outputs = set(self.interfaces)
 
-        # If multiple horizontal_add's are to be performed with 'result'
-        # we need to precompute the result!
         if len(outputs) > 1:
+            # Introduce a substitution rule for the argument of the horizontal add
             substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
             subst_rule(substname, (), result)
             result = prim.Call(prim.Variable(substname), ())
@@ -503,6 +505,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         the transformation brings the next reduction index in the fastest
         position.
 
+        In numpy's einsum notation this can be written as three contractions
+        of the form 'ij,jkl->kli'.
+
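+        For example, in 3D with 2 basis functions and 3 quadrature points per
+        direction, applying numpy.einsum('ij,jkl->kli', A, x) once per
+        direction with A of shape (3, 2) turns a coefficient tensor x of
+        shape (2, 2, 2) into quadrature values of shape (3, 3, 3).
+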
         It can make sense to permute the order of directions. If you have
         a small m_l (e.g. stage 1 on faces) it is better to do direction l
         first. This can be done by:
@@ -533,6 +538,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             other.
         interface: An SumfactKernelInterfaceBase instance describing the input
             (stage 1) or output (stage 3) of the kernel
+
         """
         # Assert the inputs!
         assert isinstance(matrix_sequence, tuple)
@@ -828,6 +834,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
                  vertical_width=1,
                  buffer=None,
                  insn_dep=frozenset(),
+                 transformations=(),
                  ):
         # Assert the input data structure
         assert isinstance(kernels, tuple)
@@ -880,8 +887,9 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     #
     @property
     def function_name(self):
-        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
+        name = "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
                                     self.interface.function_name_suffix)
+        return name
 
     @property
     def cache_key(self):
diff --git a/python/dune/codegen/sumfact/tabulation.py b/python/dune/codegen/sumfact/tabulation.py
index 9def97eb3ba4fdae280cc65e7f12ca73164d1146..e2e4a771ec9918f960e5ea88821aa7acb885a0c2 100644
--- a/python/dune/codegen/sumfact/tabulation.py
+++ b/python/dune/codegen/sumfact/tabulation.py
@@ -50,6 +50,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
                  direction=None,
                  slice_size=None,
                  slice_index=None,
+                 additional_tabulation=None,
                  ):
         """
         Arguments:
@@ -61,6 +62,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         direction: Direction corresponding to this matrix
         slice_size: Number of slices for this direction
         slice_index: To which slice does this belong
+        additional_tabulation: Another tabulation matrix whose entries are
+            multiplied onto this one (used when assembling point diagonals).
         """
         assert(isinstance(basis_size, int))
         ImmutableRecord.__init__(self,
@@ -71,6 +73,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
                                  direction=direction,
                                  slice_size=slice_size,
                                  slice_index=slice_index,
+                                 additional_tabulation=additional_tabulation,
                                  )
 
     @property
@@ -90,6 +93,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         if self.slice_size is not None:
             infos.append("s{}".format(self.slice_index))
 
+        if self.additional_tabulation is not None:
+            infos.append("prod{}".format(self.additional_tabulation._shortname))
+
         return "".join(infos)
 
     def __str__(self):
@@ -122,7 +128,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
     def pymbolic(self, indices):
         name = str(self)
         define_theta(name, self)
-        return prim.Subscript(prim.Variable(name), indices)
+        ret = prim.Subscript(prim.Variable(name), indices)
+
+        return ret
 
     @property
     def vectorized(self):
@@ -461,22 +469,31 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
     else:
         args.append(tabmat.face)
 
+    # Get right hand side of basis evaluation matrix assignment
+    expr = prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args))
+
+    # Maybe multiply another matrix (needed for the very special case of assembling point diagonals)
+    if tabmat.additional_tabulation is not None:
+        expr = prim.Product((expr, prim.Call(PolynomialLookup(polynomials, tabmat.additional_tabulation.derivative), tuple(args))))
+
     instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices),
-                expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)),
+                expression=expr,
                 kernel="operator",
                 depends_on=dep,
                 )
 
 
-def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None):
+def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None, additional_sequence=None):
     dim = world_dimension()
     result = [None] * dim
+    if additional_sequence is None:
+        additional_sequence = [None] * dim
     quadrature_size = quadrature_points_per_direction()
     assert (basis_size is not None)
     if facedir is not None:
         quadrature_size = quadrature_size[:facedir] + (1,) + quadrature_size[facedir:]
 
-    for i in range(dim):
+    for i, add_seq in zip(range(dim), additional_sequence):
         onface = None
         if facedir == i:
             onface = facemod
@@ -485,6 +502,7 @@ def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=No
                                           basis_size=basis_size[i],
                                           transpose=transpose,
                                           derivative=derivative == i,
-                                          face=onface)
+                                          face=onface,
+                                          additional_tabulation=add_seq)
 
     return tuple(result)
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
new file mode 100644
index 0000000000000000000000000000000000000000..a752d217add1013bbcb22771824cf992103b2237
--- /dev/null
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -0,0 +1,422 @@
+import re
+import logging
+
+import loopy as lp
+import pymbolic.primitives as prim
+import islpy as isl
+
+from dune.codegen.generation import (get_counted_variable,
+                                     get_global_context_value,
+                                     )
+from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions, remove_reduction
+from dune.codegen.options import get_form_option, get_option
+from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.error import CodegenAutotuneError
+from dune.codegen.sumfact.autotune import autotune_realization
+
+
+def _current_iname_order(current_inames, new_iname_order):
+    """Sort the inames for this contraction according to new_iname order"""
+    current_iname_order = []
+    for i in new_iname_order:
+        for j in current_inames:
+            if i in j:
+                current_iname_order.append(j)
+    return current_iname_order
+
+
+def _get_inames_of_reduction(instr, iname_permutation):
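+    """Classify the inames of a contraction instruction.
+
+    Returns a tuple (outer_inames, reduction_iname, inner_inames, vec_inames),
+    where 'inner' means inside the reduction loop for the requested
+    iname_permutation and 'vec' collects vectorization inames. reduction_iname
+    is None if the instruction contains no reduction.
+    """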
+    # Check which loops are inside the reduction loop
+    reduction_index = iname_permutation[-1]
+    within_reduction = tuple(i > reduction_index for i in iname_permutation)
+
+    # Basename of inner inames
+    dim = world_dimension()
+    inner_inames_base = []
+    for index, within in enumerate(within_reduction):
+        if within:
+            inner_inames_base.append('sf_out_inames_{}'.format(dim - 1 - index))
+
+    # Name of inner inames and outer inames
+    inner_inames = []
+    outer_inames = []
+    vec_inames = []
+    for i in [iname for iname in instr.within_inames]:
+        for j in inner_inames_base:
+            if j in i:
+                inner_inames.append(i)
+        if 'sf_vec' in i:
+            vec_inames.append(i)
+        elif i not in inner_inames:
+            outer_inames.append(i)
+
+    # Reduction iname
+    regex = re.compile('sf_red_([0-9]*)')
+    reduction_index = set(regex.findall(str(instr)))
+    if len(reduction_index) == 0:
+        reduction_iname = None
+    else:
+        assert len(reduction_index) == 1
+        reduction_index = reduction_index.pop()
+        reduction_iname = 'sf_red_{}'.format(reduction_index)
+
+    return outer_inames, reduction_iname, inner_inames, vec_inames
+
+
+class FindReductionMapper(lp.symbolic.WalkMapper):
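+    """WalkMapper that collects all loopy Reduction nodes of an expression."""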
+    def __init__(self):
+        self.reductions = []
+
+    def map_reduction(self, expr, **kwargs):
+        self.reductions.append(expr)
+        return lp.symbolic.WalkMapper.map_reduction(self, expr, **kwargs)
+
+
+def _reorder_reduction_loops(kernel, match, iname_order):
+    """Reorder loops in instructions containing one reduction
+
+    In order to reorder the loops we need to manually do the accumulation in a
+    temporary that is large enough to fit all the data created in loops inside
+    the reduction loop.
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    match:
+        loopy match expression selecting the instruction(s) to transform
+    iname_order: tuple of str
+        preferred loop order for this instruction
+    """
+    instructions = lp.find_instructions(kernel, match)
+    for instr in instructions:
+        # Get reduction
+        reduction = FindReductionMapper()
+        reduction(instr.expression)
+        assert len(reduction.reductions) == 1
+        reduction = reduction.reductions[0]
+
+        # Reduction iname, inner inames and vector inames
+        assert len(reduction.inames) == 1
+        reduction_iname = reduction.inames[0]
+        assert set([reduction_iname]) == instr.reduction_inames()
+        if reduction_iname not in iname_order:
+            kernel = lp.prioritize_loops(kernel, iname_order)
+            continue
+        inner_inames = iname_order[iname_order.index(reduction_iname) + 1:]
+        if len(inner_inames) == 0:
+            kernel = lp.prioritize_loops(kernel, iname_order)
+            continue
+        # TODO: There should be a better way to do that
+        vec_inames = [i for i in instr.within_inames if 'sf_vec' in i]
+        assert len(vec_inames) < 2
+
+        # {{{ Create new temporary variable
+
+        # Create dim_tags
+        dim_tags = ','.join(['f'] * len(inner_inames))
+        vectorized = len(vec_inames) > 0
+        if vectorized:
+            assert len(vec_inames) == 1
+            dim_tags = dim_tags + ',vec'
+
+        # Create shape
+        shape = tuple(kernel.get_constant_iname_length(i) for i in inner_inames + vec_inames)
+
+        # Update temporary_variables of this kernel
+        from dune.codegen.loopy.temporary import DuneTemporaryVariable
+        accum_variable = get_counted_variable('accum_variable')
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = lp.types.NumpyType(dtype_floatingpoint())
+        var = {accum_variable: DuneTemporaryVariable(accum_variable,
+                                                     dtype=dtype,
+                                                     shape=shape,
+                                                     dim_tags=dim_tags,
+                                                     managed=True)}
+        tv = kernel.temporary_variables.copy()
+        tv.update(var)
+        kernel = kernel.copy(temporary_variables=tv)
+
+        # }}}
+
+        # Set accumulation variable to zero
+        accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
+        if vectorized:
+            accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
+        assignee = prim.Subscript(prim.Variable(accum_variable), accum_init_inames)
+        accum_init_id = instr.id + '_accum_init'
+        accum_init_instr = lp.Assignment(assignee,
+                                         0,
+                                         within_inames=instr.within_inames,
+                                         id=accum_init_id,
+                                         depends_on=instr.depends_on,
+                                         tags=('accum_init',),
+                                         )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
+
+        # Accumulate in temporary variable
+        assignee = prim.Subscript(prim.Variable(accum_variable), accum_init_inames)
+        expression = prim.Sum((assignee, reduction.expr))
+        within_inames = frozenset(tuple(instr.within_inames) + reduction.inames)
+        accum_id = instr.id + '_accum'
+        accum_instr = lp.Assignment(assignee,
+                                    expression,
+                                    within_inames=within_inames,
+                                    id=accum_id,
+                                    depends_on=frozenset([accum_init_id]),
+                                    tags=('accum',),
+                                    )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
+
+        # Replace reduction in insn with accumulation result
+        def map_reduction(expr, rec):
+            return assignee
+        mapper = lp.symbolic.ReductionCallbackMapper(map_reduction)
+        new_expression = mapper(instr.expression)
+        assign_id = instr.id + '_assign'
+        new_instr = instr.copy(expression=new_expression,
+                               id=assign_id,
+                               depends_on=frozenset(list(instr.depends_on) + [accum_id]))
+        kernel = kernel.copy(instructions=kernel.instructions + [new_instr])
+
+        # Fix dependencies and remove old instruction
+        for i in kernel.instructions:
+            if instr.id in i.depends_on:
+                match = lp.match.Id(i.id)
+                kernel = lp.add_dependency(kernel, match, assign_id)
+        kernel = lp.remove_instructions(kernel, set([instr.id]))
+
+        # Duplicate and reorder loops for accumulation
+        ids = [accum_init_id, accum_id]
+        duplicate_inames = tuple(inner_inames)
+        for idx in ids:
+            match = lp.match.Id(idx)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            current_iname_order = _current_iname_order(match_inames, iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+
+        # Reorder assignment loops
+        kernel = lp.prioritize_loops(kernel, iname_order)
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction accumulating directly in the data structure"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames,
+                                                   new_iname_order)
+
+        # We can directly use lp.prioritize_loops if
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            # If the instruction is a reduction we can directly accumulate in the assignee
+            assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
+            match = lp.match.Id(instr.id)
+            kernel = remove_reduction(kernel, match)
+
+            kernel = lp.prioritize_loops(kernel, current_iname_order)
+            duplicate_inames = tuple(inner_inames)
+            match = lp.match.Id(instr.id + '_set_zero')
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
+        else:
+            # In stage 3 this is usually not a reduction. In this case direct
+            # accumulation is not possible and we accumulate in a temporary
+            # variable instead
+            match = lp.match.Id(instr.id)
+            kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction using an accumulation variable"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames, new_iname_order)
+
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        match = lp.match.Id(instr.id)
+        kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variable=True):
+    """Change loop order in tensor contractions of sum factorization kernel
+
+    Since there is a reduction involved this implies more than just reordering
+    loops. There are two possibilities to handle the reduction:
+
+    1) Generate a temporary of the correct size for the accumulation and assign
+    to the result data structure afterwards
+
+    2) Make sure to set the corresponding entries of the result data structure
+    to zero and accumulate in place
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    iname_permutation: tuple of ints
+        How to permute the loop inames. Should be a permutation of the numbers
+        0 to dim (i.e. of length dim+1).
+    accum_variable: bool
+        If True, use method 1 from above; otherwise use method 2.
+
+    Notes
+    -----
+    Using einsum notation from numpy a contraction of a sum factorization
+    kernel can be written as:
+
+    - 3D: 'ij,jkl->kli'
+    - 2D: 'ij,jk->ki'
+
+    In the sum factorization kernel itself those inames are called:
+
+    sf_out_inames_2_* : l
+    sf_out_inames_1_* : k
+    sf_out_inames_0_* : i
+    sf_red_* : j
+
+    where * represents the current direction (0,1,2 for 3D problems). The
+    default order of the loops is
+
+    (sf_out_inames_2_*, sf_out_inames_1_*, sf_out_inames_0_*, sf_red_*)
+
+    A permutation vector of (3,2,0,1) would mean that we do
+
+    (sf_out_inames_0_*, sf_red_*, sf_out_inames_1_*, sf_out_inames_2_*)
+
+    instead.
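+
+    Internally the new order is computed as
+    new_iname_order = permute_backward(default_iname_order, iname_permutation),
+    so for the permutation (3, 2, 0, 1) above the default order
+    ('l', 'k', 'i', 'j') becomes ('i', 'j', 'k', 'l').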
+
+    """
+    if accum_variable:
+        kernel = _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
+        return kernel
+    else:
+        kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
+        return kernel
+
+
+def tensor_contraction_loop_order_generator(kernel):
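+    """Yield kernel variants with different loop orders for the contractions.
+
+    The unmodified kernel is yielded first as the baseline. Afterwards every
+    remaining permutation of the loop nest (a few unpromising ones are culled)
+    is tried with both accumulation strategies of
+    reorder_loops_in_tensor_contraction.
+    """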
+    yield kernel, ['None']
+
+    dim = world_dimension()
+    identity = range(dim + 1)
+    import itertools
+    for permutation in itertools.permutations(identity):
+        # Cull variants that are not worth benchmarking
+        if permutation[0] == dim:
+            continue
+
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=True)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(permutation)]
+
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=False)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(permutation)]
+
+
+def simple_autotuner(kernel_generator, signature):
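+    """Benchmark every variant from the generator and keep the cheapest one.
+
+    Returns a tuple (best_kernel, best_transformations).
+    """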
+    kernel, transformations = next(kernel_generator)
+    best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
+    best_kernel = kernel
+    best_transformations = transformations
+    for kernel, transformations in kernel_generator:
+        cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
+        if cost < best_cost:
+            best_cost = cost
+            best_kernel = kernel
+            best_transformations = transformations
+    return best_kernel, best_transformations
+
+
+def autotune_tensor_contraction_loop_order(kernel, signature):
+    logger = logging.getLogger(__name__)
+
+    assert get_option('autotune_google_benchmark')
+    from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add
+    kernel = match_fused_multiply_add(kernel)
+    generator = tensor_contraction_loop_order_generator(kernel)
+    kernel, transformations = simple_autotuner(generator, signature)
+    logger.debug('autotune_tensor_contraction_loop_order - kernel {} transformations {}'.format(kernel.name, transformations))
+    return kernel
+
+
+def sumfact_performance_transformations(kernel, signature):
+    if get_form_option('sumfact_performance_transformations'):
+        assert not get_form_option('sumfact_performance_transformations_testrun')
+        if kernel.name.startswith('sfimpl'):
+            kernel = autotune_tensor_contraction_loop_order(kernel, signature)
+
+    # Testing performance transformations
+    elif get_form_option('sumfact_performance_transformations_testrun'):
+        assert not get_form_option('sumfact_performance_transformations')
+        if kernel.name.startswith('sfimpl'):
+            dim = world_dimension()
+            testcase = get_form_option('sumfact_performance_transformations_testrun')
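+            # Each testrun number maps to a fixed loop permutation and
+            # accumulation strategy (True: accumulate in a temporary,
+            # False: accumulate directly into the result)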
+            if dim == 2:
+                testrun_dict = {
+                    1: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), True))],
+                    2: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), False))],
+                    3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), True))],
+                    4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), False))],
+                }
+                for trafo, arguments in testrun_dict[testcase]:
+                    kernel = trafo(kernel, *arguments)
+            else:
+                testrun_dict = {
+                    1: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), True))],
+                    2: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), False))],
+                    3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), True))],
+                    4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), False))],
+                }
+                for trafo, arguments in testrun_dict[testcase]:
+                    kernel = trafo(kernel, *arguments)
+    return kernel
diff --git a/python/dune/codegen/ufl/finiteelement.py b/python/dune/codegen/ufl/finiteelement.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae0ef35f2a3b30cd2837525447233dfe017cd76d
--- /dev/null
+++ b/python/dune/codegen/ufl/finiteelement.py
@@ -0,0 +1,43 @@
+""" Registering finite elements that we have, but FEniCS does not """
+
+from ufl import register_element, L2
+
+
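+# Argument order of ufl.register_element: family name, short name, value rank,
+# Sobolev space, mapping, (minimum, maximum) degree and supported cell types.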
+register_element("Discontinuous Gauss-Lobatto-Legendre",
+                 "DGLL",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("Monomials",
+                 "Monom",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("L2-Orthonormal Polynomials",
+                 "OPB",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("Rannacher-Turek",
+                 "RaTu",
+                 0,
+                 L2,
+                 "identity",
+                 (1, 1),
+                 ('quadrilateral', 'hexahedron'),
+                 )
diff --git a/python/dune/codegen/ufl/transformations/blockpreconditioner.py b/python/dune/codegen/ufl/transformations/blockpreconditioner.py
index c992af12e0cae3499f6f6694027610dfd53c535e..b3fd11249e766bdc8abec4c0d01defbf5de9e59c 100644
--- a/python/dune/codegen/ufl/transformations/blockpreconditioner.py
+++ b/python/dune/codegen/ufl/transformations/blockpreconditioner.py
@@ -21,7 +21,7 @@ class OffDiagonalBlockSwitcher(MultiFunction):
     def positive_restricted(self, o):
         self.res = Restriction.POSITIVE
         ret = self(o.ufl_operands[0])
-        self.rest = Restriction.NONE
+        self.res = Restriction.NONE
         if isinstance(ret, uc.Zero):
             return ret
         else:
@@ -55,6 +55,8 @@ class OffDiagonalBlockSwitcher(MultiFunction):
 def list_restriction_tuples(diagonal):
     if diagonal:
         yield (Restriction.NONE, Restriction.NONE)
+        yield (Restriction.POSITIVE, Restriction.POSITIVE)
+        return
 
     res = (Restriction.POSITIVE, Restriction.NEGATIVE)
     amount = 1 if diagonal else 2
diff --git a/python/dune/codegen/ufl/visitor.py b/python/dune/codegen/ufl/visitor.py
index ab7c334f323bbc51221d926d357466d6ebc8cf88..e774e03d90e8bc61a1108767867ab046bfa56c81 100644
--- a/python/dune/codegen/ufl/visitor.py
+++ b/python/dune/codegen/ufl/visitor.py
@@ -37,6 +37,7 @@ from ufl.classes import (Coefficient,
                          JacobianDeterminant,
                          )
 
+from pytools import product as ptproduct
 import pymbolic.primitives as prim
 import numpy as np
 
@@ -278,7 +279,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     #
 
     def product(self, o):
-        return prim.flattened_product(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
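+        # Fold constant products directly instead of building a pymbolic node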
+        if all(isinstance(op, (int, float)) for op in ops):
+            return ptproduct(ops)
+        return prim.flattened_product(ops)
 
     def float_value(self, o):
         return o.value()
@@ -290,7 +294,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         return prim.quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1]))
 
     def sum(self, o):
-        return prim.flattened_sum(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
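+        # Fold constant sums directly instead of building a pymbolic node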
+        if all(isinstance(op, (int, float)) for op in ops):
+            return sum(ops)
+        return prim.flattened_sum(ops)
 
     def zero(self, o):
         # UFL has Zeroes with shape. We ignore those indices.
diff --git a/python/setup.py b/python/setup.py
index 88431ad1b496afc1c41d13fae5608956274f3018..d847cb90064e734efe22ea0646129fa8bb86be35 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -47,5 +47,6 @@ setup(name='dune.codegen',
         "console_scripts": [
             "generate_operators = dune.codegen.compile:entry_generate_operators",
             "generate_driver = dune.codegen.compile:entry_generate_driver",
+            "show_options = dune.codegen.options:show_options",
         ]
     })
diff --git a/test/sumfact/CMakeLists.txt b/test/sumfact/CMakeLists.txt
index 187aee212ed125a99bc82670fe34e680379542f7..61b81b76082cfa3c83b04d2632194d5a4a6e6447 100644
--- a/test/sumfact/CMakeLists.txt
+++ b/test/sumfact/CMakeLists.txt
@@ -2,3 +2,4 @@ add_subdirectory(hyperbolic)
 add_subdirectory(mass)
 add_subdirectory(poisson)
 add_subdirectory(stokes)
+add_subdirectory(preconditioner)
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 5bb10fbcd303a916ce1ab353d7ecac8972f5dbe4..c81c94bc9fd536645149940d290846abce857715 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -134,3 +134,24 @@ if(benchmark_FOUND)
                                     INIFILE poisson_fastdg_volumes_3d_benchmark.mini
                                     )
 endif()
+
+
+#================================================
+# Poisson fastdg with performance transformations
+#================================================
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_fastdg_2d_performance_transformations
+                                  INIFILE poisson_fastdg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_fastdg_3d_performance_transformations
+                                  INIFILE poisson_fastdg_3d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_dg_2d_performance_transformations
+                                  INIFILE poisson_dg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_dg_3d_performance_transformations
+                                  INIFILE poisson_dg_3d_performance_transformations.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..47b0e5269c9ebf62d03328e0f5370528a14ac9f6
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_dg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 0
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0b6d75f5c1181bcfe9f5c7ab9c6a941cd9653251
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
@@ -0,0 +1,29 @@
+__name = sumfact_poisson_dg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+# vectorization_strategy = explicit, none | expand gradvec
+vectorization_strategy = model, none | expand gradvec
+fastdg = 0
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
diff --git a/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..22461b9cc8bbafc6a9e15c2489bd35bf432bb5de
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = model, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0e9cfea80bab55167a277ce8e189761e9b780420
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2
diff --git a/test/sumfact/preconditioner/CMakeLists.txt b/test/sumfact/preconditioner/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ea77ba5dca71ce5afdff567be1f872dc0dc604e0
--- /dev/null
+++ b/test/sumfact/preconditioner/CMakeLists.txt
@@ -0,0 +1,11 @@
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_preconditioner_2d
+                                  INIFILE preconditioner_2d.mini
+                                  SOURCE test_preconditioner_2d.cc
+                                  )
+
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_preconditioner_3d
+                                  INIFILE preconditioner_3d.mini
+                                  SOURCE test_preconditioner_3d.cc
+                                  )
diff --git a/test/sumfact/preconditioner/poisson_dg_2d.ufl b/test/sumfact/preconditioner/poisson_dg_2d.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..ff41f6164df46aa1909db03768aef4041011afae
--- /dev/null
+++ b/test/sumfact/preconditioner/poisson_dg_2d.ufl
@@ -0,0 +1,38 @@
+cell = "quadrilateral"
+dim = 2
+degree = 1
+
+x = SpatialCoordinate(cell)
+c = (0.5-x[0])**2 + (0.5-x[1])**2
+g = exp(-1.*c)
+f = 2*(2.-2*c)*g
+
+V = FiniteElement("DG", cell, degree)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = 1.0
+
+r = inner(grad(u), grad(v))*dx \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
+
+exact_solution = g
diff --git a/test/sumfact/preconditioner/poisson_dg_3d.ufl b/test/sumfact/preconditioner/poisson_dg_3d.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..cc6594573762ee12e1b11206eeba07a4bd038726
--- /dev/null
+++ b/test/sumfact/preconditioner/poisson_dg_3d.ufl
@@ -0,0 +1,38 @@
+cell = hexahedron
+dim = 3
+degree = 1
+
+x = SpatialCoordinate(cell)
+c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2
+g = exp(-1.*c)
+f = 2*(3.-2*c)*g
+
+V = FiniteElement("DG", cell, degree)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = 1.0
+
+r = inner(grad(u), grad(v))*dx \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
+
+exact_solution = g
diff --git a/test/sumfact/preconditioner/preconditioner_2d.mini b/test/sumfact/preconditioner/preconditioner_2d.mini
new file mode 100644
index 0000000000000000000000000000000000000000..adcf214dde7018a9c5aa093cff89c51cf67d6a3a
--- /dev/null
+++ b/test/sumfact/preconditioner/preconditioner_2d.mini
@@ -0,0 +1,46 @@
+__name = preconditioner_2d
+__exec_suffix = exec
+
+cells = 2 2
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+operators = r, blockdiag, blockoffdiag, pointdiag
+
+[formcompiler.r]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+classname = FullOperator
+filename = full_2d_operator.hh
+
+[formcompiler.blockdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_diagonal = 1
+form = r
+classname = BlockDiagonalOperator
+filename = block_diagonal_2d_operator.hh
+
+[formcompiler.blockoffdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_offdiagonal = 1
+form = r
+classname = BlockOffDiagonalOperator
+filename = block_offdiagonal_2d_operator.hh
+
+[formcompiler.pointdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_pointdiagonal = 1
+form = r
+classname = PointDiagonalOperator
+filename = point_diagonal_2d_operator.hh
diff --git a/test/sumfact/preconditioner/preconditioner_3d.mini b/test/sumfact/preconditioner/preconditioner_3d.mini
new file mode 100644
index 0000000000000000000000000000000000000000..304d6e28f0e6f152d6327929984c0121dd55e978
--- /dev/null
+++ b/test/sumfact/preconditioner/preconditioner_3d.mini
@@ -0,0 +1,46 @@
+__name = preconditioner_3d
+__exec_suffix = exec
+
+cells = 2 2 2
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+operators = r, blockdiag, blockoffdiag, pointdiag
+
+[formcompiler.r]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+classname = FullOperator
+filename = full_3d_operator.hh
+
+[formcompiler.blockdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_diagonal = 1
+form = r
+classname = BlockDiagonalOperator
+filename = block_diagonal_3d_operator.hh
+
+[formcompiler.blockoffdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_offdiagonal = 1
+form = r
+classname = BlockOffDiagonalOperator
+filename = block_offdiagonal_3d_operator.hh
+
+[formcompiler.pointdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_pointdiagonal = 1
+form = r
+classname = PointDiagonalOperator
+filename = point_diagonal_3d_operator.hh
diff --git a/test/sumfact/preconditioner/test_preconditioner_2d.cc b/test/sumfact/preconditioner/test_preconditioner_2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2781b32399f800d1d6334565aefbede360fdbfa9
--- /dev/null
+++ b/test/sumfact/preconditioner/test_preconditioner_2d.cc
@@ -0,0 +1,129 @@
+#include "config.h"
+
+#include "dune/common/parallel/mpihelper.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/grid/yaspgrid.hh"
+#include "dune/pdelab/finiteelementmap/qkdg.hh"
+#include "dune/pdelab/gridoperator/fastdg.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/common/parametertreeparser.hh"
+#include <random>
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "string"
+#include "dune/codegen/vtkpredicate.hh"
+
+// Include all the generated operators
+#include "full_2d_operator.hh"
+#include "block_diagonal_2d_operator.hh"
+#include "block_offdiagonal_2d_operator.hh"
+#include "point_diagonal_2d_operator.hh"
+
+int main(int argc, char** argv){  
+  try
+  {    
+    // Initialize basic stuff...    
+    Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv);
+    using RangeType = double;
+    Dune::ParameterTree initree;
+    Dune::ParameterTreeParser::readINITree(argv[1], initree);
+    
+    // Setup grid (view)...    
+    using Grid = Dune::YaspGrid<2, Dune::EquidistantCoordinates<RangeType, 2>>;
+    using GV = Grid::LeafGridView;
+    using DF = Grid::ctype;
+    IniGridFactory<Grid> factory(initree);
+    std::shared_ptr<Grid> grid = factory.getGrid();
+    GV gv = grid->leafGridView();
+    
+    // Set up finite element maps...    
+    using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 2>;
+    DG2_FEM dg2_fem;
+    
+    // Set up grid function spaces...    
+    using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>;
+    using NoConstraintsAssembler = Dune::PDELab::NoConstraints;
+    using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>;
+    DG2_GFS dg2_gfs_(gv, dg2_fem);
+    dg2_gfs_.name("dg2_gfs_");
+    
+    // Set up constraints container...    
+    using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type;
+    DG2_GFS_CC dg2_gfs__cc;
+    dg2_gfs__cc.clear();
+    Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc);
+    
+    // Set up grid grid operators...    
+    using FullLOP = FullOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+    using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree);
+    dg2_gfs_.update();
+    MatrixBackend mb(5);
+    FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb);
+
+    // Additional grid operators for preconditioner
+    using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree);
+    BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb);
+
+    using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree);
+    BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb);
+
+    using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree);
+    PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb);
+
+    // Set up solution vectors...    
+    using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>;
+    V_R x(dg2_gfs_, 0.0);
+    
+    // Testing!
+    
+    // Assemble all those matrices
+    using Dune::PDELab::Backend::native;
+    using M = typename FullGO::Traits::Jacobian;
+    M m(fullgo);
+    fullgo.jacobian(x, m);
+    Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1);
+
+    using BDM = typename BDGO::Traits::Jacobian;
+    BDM bdm(bdgo);
+    bdgo.jacobian(x, bdm);
+    Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1);
+
+    using BODM = typename BODGO::Traits::Jacobian;
+    BODM bodm(bodgo);
+    bodgo.jacobian(x, bodm);
+    Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1);
+
+    V_R pd(dg2_gfs_, 0.0);
+    pdgo.residual(x, pd);
+    Dune::printvector(std::cout, native(pd), "point diagonal vector", "row");
+
+    // test failure boolean
+    bool testfail(false);
+
+    // TODO: Properly test this stuff given the above matrices.
+    //       Right now, visuals need to suffice.
+
+    // Return statement...    
+    return testfail;
+    
+  }  
+  catch (Dune::Exception& e)
+  {    std::cerr << "Dune reported error: " << e << std::endl;
+    return 1;
+  }  
+  catch (std::exception& e)
+  {    std::cerr << "Unknown exception thrown!" << std::endl;
+    return 1;
+  }  
+}
+
diff --git a/test/sumfact/preconditioner/test_preconditioner_3d.cc b/test/sumfact/preconditioner/test_preconditioner_3d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec84a1efbf7e19f46d8531622374f2902a3139eb
--- /dev/null
+++ b/test/sumfact/preconditioner/test_preconditioner_3d.cc
@@ -0,0 +1,129 @@
+#include "config.h"
+
+#include "dune/common/parallel/mpihelper.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/grid/yaspgrid.hh"
+#include "dune/pdelab/finiteelementmap/qkdg.hh"
+#include "dune/pdelab/gridoperator/fastdg.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/common/parametertreeparser.hh"
+#include <random>
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "string"
+#include "dune/codegen/vtkpredicate.hh"
+
+// Include all the generated operators
+#include "full_3d_operator.hh"
+#include "block_diagonal_3d_operator.hh"
+#include "block_offdiagonal_3d_operator.hh"
+#include "point_diagonal_3d_operator.hh"
+
+int main(int argc, char** argv){  
+  try
+  {    
+    // Initialize basic stuff...    
+    Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv);
+    using RangeType = double;
+    Dune::ParameterTree initree;
+    Dune::ParameterTreeParser::readINITree(argv[1], initree);
+    
+    // Setup grid (view)...    
+    using Grid = Dune::YaspGrid<3, Dune::EquidistantCoordinates<RangeType, 3>>;
+    using GV = Grid::LeafGridView;
+    using DF = Grid::ctype;
+    IniGridFactory<Grid> factory(initree);
+    std::shared_ptr<Grid> grid = factory.getGrid();
+    GV gv = grid->leafGridView();
+    
+    // Set up finite element maps...    
+    using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 3>;
+    DG2_FEM dg2_fem;
+    
+    // Set up grid function spaces...    
+    using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>;
+    using NoConstraintsAssembler = Dune::PDELab::NoConstraints;
+    using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>;
+    DG2_GFS dg2_gfs_(gv, dg2_fem);
+    dg2_gfs_.name("dg2_gfs_");
+    
+    // Set up constraints container...    
+    using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type;
+    DG2_GFS_CC dg2_gfs__cc;
+    dg2_gfs__cc.clear();
+    Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc);
+    
+    // Set up grid grid operators...    
+    using FullLOP = FullOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+    using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree);
+    dg2_gfs_.update();
+    MatrixBackend mb(5);
+    FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb);
+
+    // Additional grid operators for preconditioner
+    using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree);
+    BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb);
+
+    using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree);
+    BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb);
+
+    using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree);
+    PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb);
+
+    // Set up solution vectors...    
+    using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>;
+    V_R x(dg2_gfs_, 0.0);
+    
+    // Testing!
+    
+    // Assemble all those matrices
+    using Dune::PDELab::Backend::native;
+    using M = typename FullGO::Traits::Jacobian;
+    M m(fullgo);
+    fullgo.jacobian(x, m);
+    Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1);
+
+    using BDM = typename BDGO::Traits::Jacobian;
+    BDM bdm(bdgo);
+    bdgo.jacobian(x, bdm);
+    Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1);
+
+    using BODM = typename BODGO::Traits::Jacobian;
+    BODM bodm(bodgo);
+    bodgo.jacobian(x, bodm);
+    Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1);
+
+    V_R pd(dg2_gfs_, 0.0);
+    pdgo.residual(x, pd);
+    Dune::printvector(std::cout, native(pd), "point diagonal vector", "row");
+
+    // test failure boolean
+    bool testfail(false);
+
+    // TODO: Properly test this stuff given the above matrices.
+    //       Right now, visuals need to suffice.
+
+    // Return statement...    
+    return testfail;
+    
+  }  
+  catch (Dune::Exception& e)
+  {    std::cerr << "Dune reported error: " << e << std::endl;
+    return 1;
+  }  
+  catch (std::exception& e)
+  {    std::cerr << "Unknown exception thrown!" << std::endl;
+    return 1;
+  }  
+}
+