diff --git a/CMakeLists.txt b/CMakeLists.txt
index e333cab7b37c5466067f1cd5f05c9385e5fa120c..bf740e74d1a29d6e822e44698388d2984e7c7fb9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.1.3)
 project(dune-codegen CXX)
 
 if(NOT (dune-common_DIR OR dune-common_ROOT OR
@@ -29,6 +29,7 @@ dune_register_package_flags(LIBRARIES dunecodegen)
 
 dune_enable_all_packages()
 
+add_subdirectory(doc)
 add_subdirectory(dune/codegen)
 add_subdirectory(cmake/modules)
 
diff --git a/README.md b/README.md
index 8896b12177ec1532e69d64f1748b81f140123643..5856f3514a91a64db9734af6cf7825952b3935c0 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,25 @@ ctest
 
 Note that this takes quite a while.
 
+## Building and Running dune-codegen in an offline environment
+
+dune-codegen relies on installing Python packages into self-contained environments
+during its configuration and build process. In order to do this in an offline
+environment, we recommend using the tool `devpi`. One of its use cases is to provide
+a local mirror for the Python package index. A quickstart tutorial for this use case
+is available [5]. It boils down to the following:
+
+* Installing the `devpi-server` package through your favorite method
+* Setting up a local server with `devpi-server --init`
+* Making sure it is running in the background (explicitly with `devpi-server --start/stop` or by configuring a systemd service)
+* Setting the environment variable `PIP_INDEX_URL` to point to its index, e.g. by adding this line to your `~/.bashrc` (where `http://localhost:3141` might differ depending on your devpi configuration):
+```
+export PIP_INDEX_URL=http://localhost:3141/root/pypi/+simple/
+```
+
+At first installation, the locally mirrored package index will access PyPI.
+Later on, it will install packages from its local cache.
+
 ## Links
 
 [0]: https://git-lfs.github.com/
@@ -112,3 +131,4 @@ Note that this takes quite a while.
 [2]: https://gitlab.dune-project.org/quality/dune-testtools
 [3]: http://isl.gforge.inria.fr/
 [4]: https://www.dune-project.org/doc/installation/
+[5]: https://github.com/devpi/devpi/blob/master/doc/quickstart-pypimirror.rst
diff --git a/bin/donkey_benchmark_compilation_wrapper.sh b/bin/donkey_benchmark_compilation_wrapper.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a786d111264ef0e67769100713f1776e3e002c3d
--- /dev/null
+++ b/bin/donkey_benchmark_compilation_wrapper.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+ml gcc/6.4.0
+ml benchmark/1.4.0
+ml python/3.6.3
+ml openmpi
+ml cmake
+ml openblas
+ml metis
+ml suite-sparse
+ml superlu
+ml parmetis
+
+("$@")
+code=$?
+exit $code
diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py
new file mode 100755
index 0000000000000000000000000000000000000000..22835c7cbce61336168396007e73703394eb7433
--- /dev/null
+++ b/bin/donkey_benchmark_execution_wrapper.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import time
+import sys
+import subprocess
+import os
+
+os.environ['DUNE_CODEGEN_THREADS'] = '20'
+
+# Run the actual command
+
+# The actual measurements will be called like this (mpi parallelism)
+# command = "srun -p haswell10c -n 20 -c 2 --cpu_bind=verbose,core".split()
+
+# Command for autotune benchmarks with threading
+command = "srun -p haswell10c -n 1 -c 20 --hint=nomultithread --cpu_bind=verbose,core".split()
+
+command.extend(sys.argv[1:])
+ret = subprocess.call(command)
+
+# If that failed - fail!
+if ret != 0:
+    sys.exit(ret)
diff --git a/bin/donkey_benchmark_wrapper.py b/bin/donkey_benchmark_wrapper.py
deleted file mode 100755
index 7951b8b06bfdeb217bd3328c9b3b88f229bbc606..0000000000000000000000000000000000000000
--- a/bin/donkey_benchmark_wrapper.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env python
-
-import time
-import sys
-import subprocess
-import os
-
-# Run the actual command
-command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
-command.extend(sys.argv[1:])
-ret = subprocess.call(command)
-
-# If that failed - fail!
-if ret != 0:
-    sys.exit(ret)
-
-# If that was succesful, wait for the output file to be available on the filesystem
-# This step is necessary because the NFS synchronization is too slow for our workflow.
-while not os.path.isfile(sys.argv[2]):
-    time.sleep(0.1)
diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake
index 787f9d48399d35ae25062f59d8d2343cce2f9607..cff09c5ec49c0675680b5f49c1f24f9008c93cc5 100644
--- a/cmake/modules/DuneCodegenMacros.cmake
+++ b/cmake/modules/DuneCodegenMacros.cmake
@@ -1,6 +1,6 @@
 # File for module specific CMake tests.
 #
-# .. cmake_function:: add_generated_executable
+# .. cmake_function:: dune_add_generated_executable
 #
 #    .. cmake_param:: UFLFILE
 #       :single:
@@ -22,19 +22,20 @@
 #       The name given to the added executable target.
 #
 #    .. cmake_param:: SOURCE
+#       :single:
 #
 #       The cc source file to build from. If omitted, a minimal
 #       source file and a driver file will be generated.
 #
 #    .. cmake_param:: FORM_COMPILER_ARGS
 #       :multi:
-#       :argname arg:
+#       :argname: arg
 #
 #       Additional arguments as recognized by the form compiler.
 #
 #    .. cmake_param:: DEPENDS
 #       :multi:
-#       :argname dep:
+#       :argname: dep
 #
 #       Additional dependencies of the generated executable (changes in those
 #       will retrigger generation)
@@ -57,7 +58,7 @@
 #
 #    .. cmake_param:: ANALYZE_GRID_COMMAND
 #       :multi:
-#       :argname command:
+#       :argname: command
 #
 #       Use this to pass a custom grid analysis command. This is necessary
 #       if you use a custom grid generation methdod. The inifile and the
@@ -79,6 +80,12 @@
 #    generator is run.
 #
 
+find_package(benchmark)
+
+if (DUNE_CODEGEN_PROFILING)
+  find_package(likwid)
+endif()
+
 add_custom_target(generation)
 
 # Gather a list of form compiler sources to add as dependencies
@@ -98,7 +105,7 @@ else()
 endif()
 file(GLOB_RECURSE UFL2PDELAB_SOURCES ${UFL2PDELAB_GLOB_PATTERN})
 
-function(add_generated_executable)
+function(dune_add_generated_executable)
   set(OPTIONS EXCLUDE_FROM_ALL ANALYZE_GRID)
   set(SINGLE TARGET SOURCE UFLFILE INIFILE)
   set(MULTI FORM_COMPILER_ARGS DEPENDS ANALYZE_GRID_COMMAND)
@@ -106,15 +113,20 @@ function(add_generated_executable)
   cmake_parse_arguments(GEN "${OPTIONS}" "${SINGLE}" "${MULTI}" ${ARGN})
 
   if(GEN_UNPARSED_ARGUMENTS)
-    message(FATAL_ERROR "Unrecognized arguments in add_generated_executable. This usually indicates a typo.")
+    message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. This usually indicates a typo.")
+  endif()
+
+  set(MPI_OPTION "0")
+  if(MPI_FOUND)
+    set(MPI_OPTION "1")
   endif()
 
   # Apply defaults and enforce requirements
   if(NOT GEN_TARGET)
-    message(FATAL_ERROR "Need to specify the TARGET parameter for add_generated_executable")
+    message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable")
   endif()
   if(NOT GEN_UFLFILE)
-    message(FATAL_ERROR "Need to specify the UFLFILE parameter for add_generated_executable")
+    message(FATAL_ERROR "Need to specify the UFLFILE parameter for dune_add_generated_executable")
   endif()
   if(NOT IS_ABSOLUTE ${GEN_UFLFILE})
     set(GEN_UFLFILE ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_UFLFILE})
@@ -132,6 +144,7 @@ function(add_generated_executable)
                                --target-name ${GEN_TARGET}
                                --driver-file ${GEN_SOURCE}
                                --project-basedir ${CMAKE_BINARY_DIR}
+                               --with-mpi ${MPI_OPTION}
                                ${GEN_FORM_COMPILER_ARGS}
                        DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES}
                        COMMENT "Generating driver for the target ${GEN_TARGET}"
@@ -165,18 +178,13 @@ function(add_generated_executable)
   endif()
 
   # Parse a mapping of operators to build and their respective filenames
-  dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET}
-                       OUTPUT_VARIABLE depdata
-                       )
-  parse_python_data(PREFIX depdata INPUT ${depdata})
+  dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
+  parse_python_data(PREFIX depdata FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
 
   if(DUNE_CODEGEN_PROFILING)
     # This is a bit silly, but cProfile only finds entry point scripts
-    # if their full path is provided. So we resort to using which.
-    dune_execute_process(COMMAND which generate_operators
-                         OUTPUT_VARIABLE fullcommand
-                         OUTPUT_STRIP_TRAILING_WHITESPACE
-                         )
+    # if their full path is provided. 
+    set(fullcommand "${DUNE_PYTHON_VIRTUALENV_PATH}/bin/generate_operators")
   endif()
 	
   # Define build rules for all operator header files and gather a list of them
@@ -195,6 +203,7 @@ function(add_generated_executable)
                                --ini-file ${GEN_INIFILE}
                                --target-name ${GEN_TARGET}
                                --operator-to-build ${op}
+                               --with-mpi ${MPI_OPTION}
                                ${ANALYZE_GRID_OPTION}
                        DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} ${ANALYZE_GRID_FILE}
                        COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}"
diff --git a/cmake/modules/Findlikwid.cmake b/cmake/modules/Findlikwid.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..778901280ee0b778bd7131cf1fcee4b3181557f3
--- /dev/null
+++ b/cmake/modules/Findlikwid.cmake
@@ -0,0 +1,104 @@
+# .. cmake_module::
+#
+#    Module that checks whether likwid is available and usable.
+#
+#    Variables used by this module which you may want to set:
+#
+#    :ref:`likwid_ROOT`
+#       Path list to search for likwid.
+#
+#    Sets the following variables:
+#
+#    :code:`likwid_FOUND`
+#       True if likwid available.
+#
+#    :code:`likwid_INCLUDE_DIRS`
+#       Path to the likwid include directories.
+#
+#
+#    :code:`likwid_LIBRARIES`
+#       Link against these libraries to use likwid.
+#
+# .. cmake_variable:: likwid_ROOT
+#
+#    You may set this variable to have :ref:`Findlikwid` look
+#    for the likwid package in the given paths before inspecting
+#    system paths.
+#
+find_path(LIKWID_INCLUDE_DIR
+        NAMES "likwid.h"
+        PATHS ${likwid_ROOT}
+        PATH_SUFFIXES "include" "include/likwid"
+        NO_DEFAULT_PATH)
+find_path(LIKWID_INCLUDE_DIR
+        NAMES "likwid.h"
+        PATH_SUFFIXES "include" "include/likwid")
+
+find_library(LIKWID_LIBRARY
+        NAMES "likwid"
+        PATHS ${likwid_ROOT}
+        PATH_SUFFIXES "lib" "lib32" "lib64"
+        NO_DEFAULT_PATH)
+find_library(LIKWID_LIBRARY
+        NAMES "likwid"
+        PATH_SUFFIXES "lib" "lib32" "lib64")
+
+include(CMakePushCheckState)
+cmake_push_check_state()
+
+if(LIKWID_INCLUDE_DIR)
+    set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${LIKWID_INCLUDE_DIR})
+endif()
+if(LIKWID_LIBRARY)
+    set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} ${LIKWID_LIBRARY})
+endif()
+
+cmake_pop_check_state()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+        "likwid"
+        DEFAULT_MSG
+        LIKWID_INCLUDE_DIR
+        LIKWID_LIBRARY
+)
+
+mark_as_advanced(LIKWID_INCLUDE_DIR LIKWID_LIBRARY)
+
+# if headers are found, store results
+if(likwid_FOUND)
+    set(likwid_INCLUDE_DIRS ${LIKWID_INCLUDE_DIR})
+    set(likwid_LIBRARIES ${LIKWID_LIBRARY})
+    # log result
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+            "Determining location of likwid succeeded:\n"
+            "Include directory: ${likwid_INCLUDE_DIRS}\n"
+            "Libraries to link against: ${likwid_LIBRARIES}\n\n")
+
+    set(likwid_DUNE_COMPILE_FLAGS "-I${likwid_INCLUDE_DIRS}"
+            CACHE STRING "Compile Flags used by DUNE when compiling with likwid programs")
+    set(likwid_DUNE_LIBRARIES ${likwid_LIBRARIES}
+            CACHE STRING "Libraries used by DUNE when linking likwid programs")
+else()
+    # log erroneous result
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+            "Determining location of likwid failed:\n"
+            "Include directory: ${likwid_INCLUDE_DIRS}\n"
+            "Libraries to link against: ${likwid_LIBRARIES}\n\n")
+endif()
+
+# set HAVE_LIKWID for config.h
+set(HAVE_LIKWID ${likwid_FOUND})
+
+
+# register all likwid related flags
+if(likwid_FOUND)
+    dune_register_package_flags(COMPILE_DEFINITIONS "ENABLE_LIKWID=1"
+            LIBRARIES "${likwid_LIBRARIES}"
+            INCLUDE_DIRS "${likwid_INCLUDE_DIRS}")
+endif()
+
+# text for feature summary
+set_package_properties("LIKWID" PROPERTIES
+        DESCRIPTION "likwid"
+        PURPOSE "Performance monitoring and benchmarking suite.")
\ No newline at end of file
diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake
index 206ce3bcbf10a457182ffbb1a5f8e415ea3fcb24..2a1907950a7f5bd68e85a5c536465a73634828cf 100644
--- a/cmake/modules/GeneratedSystemtests.cmake
+++ b/cmake/modules/GeneratedSystemtests.cmake
@@ -1,11 +1,85 @@
-# System testing for generated executables. All ideas taken from dune-testtools.
+# System testing for generated executables. All ideas taken from dune-testtools.
+#
+# .. cmake_function:: dune_add_formcompiler_system_test
+#
+#    .. cmake_param:: UFLFILE
+#       :single:
+#       :required:
+#
+#       The UFL file to create the generate code from.
+#
+#    .. cmake_param:: INIFILE
+#       :single:
+#       :required:
+#
+#       The ini file that controls the form compilation process.
+#       It is expected to contain a [formcompiler] section. This
+#       file may contain meta ini annotations.
+#
+#    .. cmake_param:: BASENAME
+#       :single:
+#       :required:
+#
+#       The basename for the generated executables.
+#
+#    .. cmake_param:: SCRIPT
+#       :single:
+#
+#       The python script that decides about test failure/success.
+#       Defaults to a script that simply runs the program and checks
+#       the exit code. More scripts to be found in dune-testtools.
+#
+#    .. cmake_param:: SOURCE
+#       :single:
+#
+#       The cc source file to build from. If omitted, a minimal
+#       source file and a driver file will be generated.
+#
+#    .. cmake_param:: CREATED_TARGETS
+#       :single:
+#
+#       A variable name that should be filled with the list of generated
+#       targets. This can be used to modify these later on.
+#
+#    .. cmake_param:: DEPENDS
+#       :multi:
+#       :argname: dep
+#
+#       Additional dependencies of the generated executable (changes in those
+#       will retrigger generation)
+#
+#    .. cmake_param:: NO_TESTS
+#       :option:
+#
+#       If given, code will be generated and built normally, but no tests will
+#       be added to the test suite.
+#
+#    .. cmake_param:: ANALYZE_GRID
+#       :option:
+#
+#       Set this option to enable code generation time grid analysis.
+#       This is useful to reduce the variety of sum factorization kernels
+#       in unstructured grids. Note that the grid analysis tool needs to
+#       be able to construct your grid from the given inifile. If you have
+#       a custom grid construction method, you can use ANALYZE_GRID_COMMAND
+#       instead.
+#
+#    .. cmake_param:: ANALYZE_GRID_COMMAND
+#       :multi:
+#       :argname: command
+#
+#       Use this to pass a custom grid analysis command. This is necessary
+#       if you use a custom grid generation method. The inifile and the
+#       outputfile will be appended to this command. You can use the analysis code in
+#       dune/codegen/sumfact/analyzegrid.hh to write your own tool.
+#       Specifying this option will automatically set ANALYZE_GRID.
 #
 
 function(dune_add_formcompiler_system_test)
   # parse arguments
   set(OPTION DEBUG NO_TESTS ANALYZE_GRID)
-  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE)
-  set(MULTI CREATED_TARGETS DEPENDS)
+  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE CREATED_TARGETS)
+  set(MULTI DEPENDS ANALYZE_GRID_COMMAND)
   cmake_parse_arguments(SYSTEMTEST "${OPTION}" "${SINGLE}" "${MULTI}" ${ARGN})
 
   if(SYSTEMTEST_UNPARSED_ARGUMENTS)
@@ -25,6 +99,10 @@ function(dune_add_formcompiler_system_test)
   if(SYSTEMTEST_ANALYZE_GRID)
     set(ANALYZE_GRID_STR "ANALYZE_GRID")
   endif()
+  set(ANALYZE_GRID_COMMAND_STR "")
+  if(SYSTEMTEST_ANALYZE_GRID_COMMAND)
+    set(ANALYZE_GRID_COMMAND_STR "ANALYZE_GRID_COMMAND ${SYSTEMTEST_ANALYZE_GRID_COMMAND}")
+  endif()
 
   # set a default for the script. call_executable.py just calls the executable.
   # There, it is also possible to hook in things depending on the inifile
@@ -41,9 +119,14 @@ function(dune_add_formcompiler_system_test)
   configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} ${CMAKE_CURRENT_BINARY_DIR}/tmp_${SYSTEMTEST_INIFILE})
 
   # expand the given meta ini file into the build tree
-  execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py --cmake --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} --dir ${CMAKE_CURRENT_BINARY_DIR} --section formcompiler
-                  OUTPUT_VARIABLE output)
-  parse_python_data(PREFIX INIINFO INPUT "${output}")
+  execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py
+                          --cmake
+                          --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE}
+                          --dir ${CMAKE_CURRENT_BINARY_DIR}
+                          --section formcompiler
+                          --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
+                          )
+  parse_python_data(PREFIX INIINFO FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
 
   foreach(inifile ${INIINFO_names})
     if(${INIINFO_${inifile}_suffix} STREQUAL "__empty")
@@ -55,23 +138,25 @@ function(dune_add_formcompiler_system_test)
       endif()
     endif()
 
-    add_generated_executable(TARGET ${tname}
-                             UFLFILE ${SYSTEMTEST_UFLFILE}
-                             INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
-                             DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
-                             EXCLUDE_FROM_ALL
-                             ${SOURCE}
-                             ${ANALYZE_GRID_STR}
-                             )
+    dune_add_generated_executable(TARGET ${tname}
+                                  UFLFILE ${SYSTEMTEST_UFLFILE}
+                                  INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
+                                  DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
+                                  EXCLUDE_FROM_ALL
+                                  ${SOURCE}
+                                  ${ANALYZE_GRID_STR}
+                                  ${ANALYZE_GRID_COMMAND_STR}
+                                  )
 
     # Enrich the target with preprocessor variables from the __static section
     # just the way that dune-testtools does.
     dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_extract_static.py
                                --ini ${inifile}
+                               --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
                          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
                          OUTPUT_VARIABLE output
                          ERROR_MESSAGE "Error extracting static info from ${inifile}")
-    parse_python_data(PREFIX STAT INPUT "${output}")
+    parse_python_data(PREFIX STAT FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
 
     foreach(config ${STAT___CONFIGS})
       foreach(cd ${STAT___STATIC_DATA})
diff --git a/cmake/modules/deplist.py b/cmake/modules/deplist.py
index 9cb5d7d42cbfc712e18e37798dffc2d553416f8c..7b0afac3c7c5f216e9710cef394d175c0c6c9ee9 100755
--- a/cmake/modules/deplist.py
+++ b/cmake/modules/deplist.py
@@ -22,5 +22,5 @@ def get_filename(operator):
 result = {"__{}".format(o): get_filename(o) for o in operators}
 result["__operators"] = ";".join(operators)
 
-printForCMake(result)
+printForCMake(result, sys.argv[3])
 sys.exit(0)
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..956de868a985aec3af90d7263417eb16b9be09df
--- /dev/null
+++ b/doc/CMakeLists.txt
@@ -0,0 +1 @@
+dune_cmake_sphinx_doc(MODULE_ONLY)
diff --git a/dune/codegen/common/simdtraits.hh b/dune/codegen/common/simdtraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..73ee4caff1bb743d275e20173556416cc0bd2d30
--- /dev/null
+++ b/dune/codegen/common/simdtraits.hh
@@ -0,0 +1,16 @@
+#ifndef DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+#define DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+
+/** This is just the declaration of the traits classes, specialization for VCL and
+ *  OpCounter VCL are elsewhere.
+ */
+
+template<typename T>
+struct base_floatingpoint
+{};
+
+template<typename T>
+struct simd_size
+{};
+
+#endif
diff --git a/dune/codegen/common/vcltraits.hh b/dune/codegen/common/vcltraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..de764dbe6f78a51d1bf5934856ca36d4b90a77d0
--- /dev/null
+++ b/dune/codegen/common/vcltraits.hh
@@ -0,0 +1,86 @@
+#ifndef DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+#define DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+
+/** A collection of traits tools for the Vector Class Library */
+
+#include<dune/codegen/common/vectorclass.hh>
+
+
+template<>
+struct base_floatingpoint<Vec2d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec4f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec2d>
+{
+  static constexpr std::size_t value = 2;
+};
+
+template<>
+struct simd_size<Vec4f>
+{
+  static constexpr std::size_t value = 4;
+};
+
+#if MAX_VECTOR_SIZE >= 256
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec4d>
+{
+  static constexpr std::size_t value = 4;
+};
+
+template<>
+struct simd_size<Vec8f>
+{
+  static constexpr std::size_t value = 8;
+};
+#endif
+
+#if MAX_VECTOR_SIZE >= 512
+template<>
+struct base_floatingpoint<Vec8d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec16f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec8d>
+{
+  static constexpr std::size_t value = 8;
+};
+
+template<>
+struct simd_size<Vec16f>
+{
+  static constexpr std::size_t value = 16;
+};
+
+#endif
+
+#endif
diff --git a/dune/codegen/common/vectorclass.hh b/dune/codegen/common/vectorclass.hh
index aa3bba7b98f9cf57dcc223f61755e3aa26c5bdca..648ea52fe24490efc8af431fe54de4089515af33 100644
--- a/dune/codegen/common/vectorclass.hh
+++ b/dune/codegen/common/vectorclass.hh
@@ -1,71 +1,23 @@
 #ifndef DUNE_CODEGEN_COMMON_VECTORCLASS_HH
 #define DUNE_CODEGEN_COMMON_VECTORCLASS_HH
 
-
-template<typename T>
-struct base_floatingpoint
-{};
+#include<dune/codegen/common/simdtraits.hh>
 
 #ifdef ENABLE_COUNTER
+
 #if HAVE_DUNE_OPCOUNTER
 #include<dune/opcounter/vectorclass.hh>
-
-template<typename F, int size>
-struct base_floatingpoint<OpCounter::impl::OpCounterVector<F, size>>
-{
-  using value = OpCounter::OpCounter<F>;
-};
-
-
 #else
 #error "dune-opcounter is needed for opcounted vector types"
 #endif
+
 #else
+
 #include<dune/codegen/vectorclass/vectorclass.h>
 #include<dune/codegen/vectorclass/vectormath_exp.h>
 #include<dune/codegen/vectorclass/vectormath_hyp.h>
 #include<dune/codegen/vectorclass/vectormath_trig.h>
-
-template<>
-struct base_floatingpoint<Vec2d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec4f>
-{
-  using value = float;
-};
-
-
-#if MAX_VECTOR_SIZE >= 256
-template<>
-struct base_floatingpoint<Vec4d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec8f>
-{
-  using value = float;
-};
-#endif
-
-#if MAX_VECTOR_SIZE >= 512
-template<>
-struct base_floatingpoint<Vec8d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec16f>
-{
-  using value = float;
-};
-#endif
+#include<dune/codegen/common/vcltraits.hh>
 
 #endif
 
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 84ac56603f1c749cf8979d52d3d890ed9d2e0ee4..67694937376b1b6cdaa66d44db07289652f62bcc 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -21,6 +21,10 @@ dune_python_add_test(NAME pep8-ourcode
 
 add_subdirectory(test)
 
-# Add a dummy target to extract compiler flags for the autotune tool chain
 add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
 target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
+
+find_package(Threads)
+if(benchmark_FOUND AND Threads_FOUND)
+  target_link_libraries(_autotune_target benchmark Threads::Threads)
+endif()
diff --git a/python/dune/codegen/__init__.py b/python/dune/codegen/__init__.py
index e892baf44c4c9fb301536c15c40059a5acc5d602..196419782487f33f13cf797d19c1fbdcbd13d946 100644
--- a/python/dune/codegen/__init__.py
+++ b/python/dune/codegen/__init__.py
@@ -10,3 +10,6 @@ import dune.codegen.loopy.symbolic  # noqa
 import dune.codegen.pdelab  # noqa
 import dune.codegen.sumfact  # noqa
 import dune.codegen.blockstructured  # noqa
+
+# Trigger imports that register finite elements in UFL
+import dune.codegen.ufl.finiteelement  # noqa
diff --git a/python/dune/codegen/blockstructured/accumulation.py b/python/dune/codegen/blockstructured/accumulation.py
index 9af9759d5e3426df12a10db39f94048e48006867..ec2e3d9917998a7a024796d48aaf6be6488a271c 100644
--- a/python/dune/codegen/blockstructured/accumulation.py
+++ b/python/dune/codegen/blockstructured/accumulation.py
@@ -1,12 +1,12 @@
-from dune.codegen.blockstructured.tools import sub_element_inames
-from dune.codegen.generation import accumulation_mixin, instruction
+from dune.codegen.blockstructured.tools import sub_element_inames, name_accumulation_alias
+from dune.codegen.generation import accumulation_mixin, instruction, get_global_context_value
 from dune.codegen.loopy.target import dtype_floatingpoint
 from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.geometry import world_dimension, name_intersection_geometry_wrapper
 from dune.codegen.pdelab.localoperator import determine_accumulation_space, GenericAccumulationMixin
 from dune.codegen.pdelab.argument import name_accumulation_variable
 from dune.codegen.pdelab.localoperator import boundary_predicates
-from dune.codegen.generation.loopy import function_mangler, globalarg, temporary_variable
+from dune.codegen.generation.loopy import function_mangler, temporary_variable
 import loopy as lp
 import pymbolic.primitives as prim
 
@@ -16,32 +16,10 @@ from loopy.match import Writes
 @accumulation_mixin("blockstructured")
 class BlockStructuredAccumulationMixin(GenericAccumulationMixin):
     def generate_accumulation_instruction(self, expr):
-        if get_form_option('vectorization_blockstructured'):
-            return generate_accumulation_instruction_vectorized(expr, self)
-        else:
+        if get_global_context_value("form_type") == "jacobian":
             return generate_accumulation_instruction(expr, self)
-
-
-def name_accumulation_alias(container, accumspace):
-    name = container + "_" + accumspace.lfs.name + "_alias"
-    name_tail = container + "_" + accumspace.lfs.name + "_alias_tail"
-    k = get_form_option("number_of_blocks")
-    p = accumspace.element.degree()
-
-    def _add_alias_insn(name):
-        dim = world_dimension()
-        element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
-        index_stride = tuple((p * k + 1)**i for i in range(0, dim))
-        globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
-        code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name)
-        instruction(within_inames=frozenset(),
-                    code=code,
-                    read_variables=frozenset({container}),
-                    assignees=frozenset({name}))
-
-    _add_alias_insn(name)
-    _add_alias_insn(name_tail)
-    return name
+        else:
+            return generate_accumulation_instruction_vectorized(expr, self)
 
 
 @function_mangler
diff --git a/python/dune/codegen/blockstructured/argument.py b/python/dune/codegen/blockstructured/argument.py
index 420773e85bea93ee55cb310255da7fa60d55d9de..deff1b8415d246d0ed197692df46b61db2dbd5cc 100644
--- a/python/dune/codegen/blockstructured/argument.py
+++ b/python/dune/codegen/blockstructured/argument.py
@@ -1,29 +1,11 @@
-from dune.codegen.generation import (kernel_cached,
-                                     valuearg, instruction, globalarg)
+from dune.codegen.generation import kernel_cached, valuearg
 from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.argument import CoefficientAccess
-from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames
-from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames, name_container_alias
 from loopy.types import NumpyType
 import pymbolic.primitives as prim
 
 
-def name_alias(container, lfs, element):
-    name = container + "_" + lfs.name + "_alias"
-    k = get_form_option("number_of_blocks")
-    p = element.degree()
-    dim = world_dimension()
-    element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
-    index_stride = tuple((p * k + 1)**i for i in range(0, dim))
-    globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
-    code = "const auto {} = &{}({},0);".format(name, container, lfs.name)
-    instruction(within_inames=frozenset(),
-                code=code,
-                read_variables=frozenset({container}),
-                assignees=frozenset({name}))
-    return name
-
-
 # TODO remove the need for element
 @kernel_cached
 def pymbolic_coefficient(container, lfs, element, index):
@@ -36,9 +18,6 @@ def pymbolic_coefficient(container, lfs, element, index):
         lfs = prim.Variable(lfs)
 
     # use higher order FEM index instead of Q1 index
-    if get_form_option("vectorization_blockstructured"):
-        subelem_inames = sub_element_inames()
-        coeff_alias = name_alias(container, lfs, element)
-        return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index))
-    else:
-        return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
+    subelem_inames = sub_element_inames()
+    coeff_alias = name_container_alias(container, lfs, element)
+    return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index))
diff --git a/python/dune/codegen/blockstructured/basis.py b/python/dune/codegen/blockstructured/basis.py
index 5af8bb48f2bb625a0b336c5cae6d42863f0063c5..efbc1bc9b3b206d0503686679d6a15547d5004a9 100644
--- a/python/dune/codegen/blockstructured/basis.py
+++ b/python/dune/codegen/blockstructured/basis.py
@@ -7,7 +7,7 @@ from dune.codegen.generation import (basis_mixin,
                                      globalarg,
                                      class_member,
                                      initializer_list,
-                                     include_file,)
+                                     include_file, preamble)
 from dune.codegen.tools import get_pymbolic_basename, get_pymbolic_indices
 from dune.codegen.loopy.target import type_floatingpoint
 from dune.codegen.pdelab.basis import (GenericBasisMixin,
@@ -38,11 +38,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         assert not isinstance(element, MixedElement)
         name = "phi_{}".format(FEM_name_mangling(element))
         name = restricted_name(name, restriction)
+        self.init_basis_cache(element, restriction)
         self.evaluate_basis(element, name, restriction)
         inames = self.lfs_inames(element, restriction, number, context=context)
 
         return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
 
+    @preamble(kernel='operator')
+    def init_basis_cache(self, element, restriction):
+        if not restriction:
+            cache = name_localbasis_cache(element)
+            localbasis = name_localbasis(element)
+            qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
+            return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
+                    "{",
+                    "  {}.evaluateFunction({}[i], {});".format(cache, qp_name, localbasis),
+                    "}"]
+        else:
+            return []
+
     @kernel_cached
     def evaluate_basis(self, element, name, restriction):
         temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1),
@@ -60,11 +74,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         assert not isinstance(element, MixedElement)
         name = "js_{}".format(FEM_name_mangling(element))
         name = restricted_name(name, restriction)
+        self.init_gradient_cache(element, restriction)
         self.evaluate_reference_gradient(element, name, restriction)
         inames = self.lfs_inames(element, restriction, number, context=context)
 
         return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
 
+    @preamble(kernel='operator')
+    def init_gradient_cache(self, element, restriction):
+        if not restriction:
+            cache = name_localbasis_cache(element)
+            localbasis = name_localbasis(element)
+            qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
+            return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
+                    "{",
+                    "  {}.evaluateJacobian({}[i], {});".format(cache, qp_name, localbasis),
+                    "}"]
+        else:
+            return []
+
     @kernel_cached
     def evaluate_reference_gradient(self, element, name, restriction):
         temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1, world_dimension()),
diff --git a/python/dune/codegen/blockstructured/tools.py b/python/dune/codegen/blockstructured/tools.py
index 802b819f23754aa0862e122f4127a3db18593117..e9acf640cc3f6febad32dd9bec40cda7d75b881f 100644
--- a/python/dune/codegen/blockstructured/tools.py
+++ b/python/dune/codegen/blockstructured/tools.py
@@ -2,7 +2,7 @@ from dune.codegen.tools import get_pymbolic_basename
 from dune.codegen.generation import (iname,
                                      domain,
                                      temporary_variable,
-                                     instruction)
+                                     instruction, globalarg, preamble)
 from dune.codegen.pdelab.geometry import world_dimension
 from dune.codegen.options import get_form_option
 import pymbolic.primitives as prim
@@ -74,3 +74,33 @@ def name_point_in_macro(point_in_micro, visitor):
     name = get_pymbolic_basename(point_in_micro) + "_macro"
     define_point_in_macro(name, point_in_micro, visitor)
     return name
+
+
+@preamble
+def define_container_alias(name, container, lfs, element, is_const):
+    k = get_form_option("number_of_blocks")
+    p = element.degree()
+    dim = world_dimension()
+    element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
+    index_stride = tuple((p * k + 1)**i for i in range(0, dim))
+    globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
+    if is_const:
+        return "const auto {} = &{}({},0);".format(name, container, lfs.name)
+    else:
+        return "auto {} = &{}.container()({},0);".format(name, container, lfs.name)
+
+
+def name_accumulation_alias(container, accumspace):
+    name = container + "_" + accumspace.lfs.name + "_alias"
+    name_tail = container + "_" + accumspace.lfs.name + "_alias_tail"
+
+    define_container_alias(name, container, accumspace.lfs, accumspace.element, is_const=False)
+    define_container_alias(name_tail, container, accumspace.lfs, accumspace.element, is_const=False)
+    return name
+
+
+def name_container_alias(container, lfs, element):
+    name = container + "_" + lfs.name + "_alias"
+
+    define_container_alias(name, container, lfs, element, is_const=True)
+    return name
diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index b35918b23b3d6e8eebf12d6ef08c94a927d692f9..94bc4875f262c72d695007be04c0b4107208f2f5 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -1,8 +1,10 @@
 import loopy as lp
 import numpy as np
 import pymbolic.primitives as prim
+from dune.codegen.blockstructured.tools import sub_element_inames
 
-from loopy.match import Tagged, Id, Writes, Or
+from loopy.match import Tagged, Id, Writes, Reads, And, Or, Iname, All, Not
+from islpy import BasicSet
 
 from dune.codegen.generation import get_global_context_value
 from dune.codegen.loopy.target import dtype_floatingpoint
@@ -14,38 +16,39 @@ from dune.codegen.pdelab.geometry import world_dimension
 from dune.codegen.tools import get_pymbolic_basename
 
 
-def add_vcl_temporaries(knl):
+def add_vcl_temporaries(knl, vcl_size):
     vector_alias = [a for a in knl.arg_dict if a.endswith('alias')]
 
     # add new temporaries for vectors
     # hope one read insn doesn't have two different reads from the same temporary
     new_vec_temporaries = dict()
     new_insns = []
-    init_iname = 'init_vec'
+    init_iname = 'init_vec{}'.format(vcl_size)
     from islpy import BasicSet
     init_domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(init_iname, get_vcl_type_size(dtype_floatingpoint())))
+
+    silenced_warnings = []
+
     for alias in vector_alias:
-        vector_name = alias.replace('alias', 'vec')
+        vector_name = alias.replace('alias', 'vec{}'.format(vcl_size))
         new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64,
-                                                                 shape=(get_vcl_type_size(np.float64),), managed=True,
+                                                                 shape=(vcl_size,), managed=True,
                                                                  scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
-        # write once to the vector such that loopy won't complain
-        new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(init_iname)),
-                                       expression=0, within_inames=frozenset({init_iname}),
-                                       id='init_' + vector_name))
+        # silence warning such that loopy won't complain
+        silenced_warnings.append("read_no_write({})".format(vector_name))
 
     from loopy.kernel.data import VectorizeTag
     return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [init_domain],
                     temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries),
-                    iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}))
+                    iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}),
+                    silenced_warnings=knl.silenced_warnings + silenced_warnings)
 
 
-def add_vcl_accum_insns(knl, iname_inner, iname_outer):
+def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level):
     nptype = dtype_floatingpoint()
-    vcl_size = get_vcl_type_size(np.float64)
 
-    from loopy.match import Tagged
-    accum_insns = set(lp.find_instructions(knl, Tagged('accum')))
+    accum_insns = lp.find_instructions(knl, And((Tagged('accum'), Iname(inner_iname))))
+    accum_ids = [insn.id for insn in accum_insns]
 
     new_insns = []
     vng = knl.get_var_name_generator()
@@ -53,12 +56,12 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
     new_vec_temporaries = dict()
     for insn in knl.instructions:
         # somehow CInstructions are not hashable....
-        if isinstance(insn, lp.MultiAssignmentBase) and insn in accum_insns:
+        if isinstance(insn, lp.MultiAssignmentBase) and insn.id in accum_ids:
             # write accum expr as "r = expr + r"
             expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee))
 
             inames_micro = set((i for i in insn.within_inames if i.startswith('micro')))
-            iname_ix = next((i for i in inames_micro if i.endswith("_x")))
+            iname_ix = next((i for i in inames_micro if '_x' in i))  # TODO use TaggedIname when available
 
             # need inames for head and tail handling a priori
             from loopy.match import Not, All
@@ -75,8 +78,8 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
             inames_tail = frozenset((var.name for var in replace_tail_inames.values()))
 
             # erstelle a[iy] und b
-            identifier_left = vng('left_node')
-            identifier_right = vng('right_node')
+            identifier_left = vng('left_node_vec{}'.format(vcl_size))
+            identifier_right = vng('right_node_vec{}'.format(vcl_size))
             new_vec_temporaries[identifier_left] = DuneTemporaryVariable(identifier_left, dtype=np.float64,
                                                                          shape=(2,) * (world_dimension() - 1) +
                                                                                (vcl_size,),
@@ -89,37 +92,40 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
 
             var_left = prim.Subscript(prim.Variable(identifier_left),
                                       tuple(prim.Variable(i) for i in sorted(inames_micro - frozenset({iname_ix}))) +
-                                      (prim.Variable(iname_inner),))
-            var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(iname_inner),))
+                                      (prim.Variable(inner_iname),))
+            var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(inner_iname),))
 
             # init a
             id_init_a = idg('insn_init_' + identifier_left)
             new_insns.append(lp.Assignment(assignee=substitute(var_left, replace_head_inames),
                                            expression=0,
                                            id=id_init_a,
-                                           within_inames=(insn.within_inames - frozenset({iname_outer}) -
+                                           within_inames=(insn.within_inames - frozenset({outer_iname}) -
                                                           inames_micro) | inames_head,
-                                           tags=frozenset({'head'})))
+                                           tags=frozenset({'head_vec{}'.format(vcl_size),
+                                                           'vectorized_{}'.format(level)})))
 
             # setze werte für a und b
             expr_right = substitute(expr_without_r, {iname_ix: 1})
             expr_left = prim.Sum((substitute(expr_without_r, {iname_ix: 0}), var_left))
 
-            id_set_left = idg('insn_' + identifier_left)
-            id_set_right = idg('insn_' + identifier_right)
+            id_set_left = idg('{}_{}'.format(insn.id, identifier_left))
+            id_set_right = idg('{}_{}'.format(insn.id, identifier_right))
             new_insns.append(lp.Assignment(assignee=var_right,
                                            expression=expr_right,
                                            id=id_set_right,
                                            depends_on=insn.depends_on,
-                                           within_inames=insn.within_inames - frozenset({iname_ix})))
+                                           within_inames=insn.within_inames - frozenset({iname_ix}),
+                                           tags=frozenset({'vectorized_{}'.format(level)})))
             new_insns.append(lp.Assignment(assignee=var_left,
                                            expression=expr_left,
                                            id=id_set_left,
                                            depends_on=insn.depends_on | frozenset({id_init_a}),
-                                           within_inames=insn.within_inames - frozenset({iname_ix})))
+                                           within_inames=insn.within_inames - frozenset({iname_ix}),
+                                           tags=frozenset({'vectorized_{}'.format(level)})))
 
             # r+=a[iy]
-            id_accum = idg('insn_mod_accum')
+            id_accum = idg('{}_mod_accum'.format(insn.id))
             expr_accum = prim.Sum((var_left,
                                    prim.Call(VCLPermute(nptype, vcl_size, (-1,) + tuple(range(vcl_size - 1))),
                                              (var_right,)),
@@ -130,9 +136,10 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
                                            depends_on=insn.depends_on | frozenset({id_set_left,
                                                                                    id_init_a, id_set_right}),
                                            within_inames=insn.within_inames - frozenset({iname_ix}),
-                                           tags=frozenset({'accum'})))
+                                           tags=frozenset({'accum_vec{}'.format(vcl_size),
+                                                           'vectorized_{}'.format(level)})))
             # a[iy] = permute
-            id_permute = idg('insn_permute')
+            id_permute = idg('{}_permute'.format(insn.id))
             expr_permute = prim.Call(VCLPermute(nptype, vcl_size, (vcl_size - 1,) + (-1,) * (vcl_size - 1)),
                                      (var_right,))
             new_insns.append(lp.Assignment(assignee=var_left,
@@ -140,16 +147,17 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
                                            id=id_permute,
                                            depends_on=insn.depends_on | frozenset({id_set_left, id_init_a, id_set_right,
                                                                                    id_accum}),
-                                           within_inames=insn.within_inames - frozenset({iname_ix})
+                                           within_inames=insn.within_inames - frozenset({iname_ix}),
+                                           tags=frozenset({'vectorized_{}'.format(level)})
                                            ))
 
             # tail handling, uses tail alias
-            id_accum_tail = idg('insn_accum_tail')
-            subst_map = {iname_inner: vcl_size - 1, iname_outer: get_form_option("number_of_blocks") // vcl_size - 1,
+            id_accum_tail = idg('{}_accum_tail'.format(insn.id))
+            subst_map = {inner_iname: vcl_size - 1, outer_iname: get_form_option("number_of_blocks") // vcl_size - 1,
                          iname_ix: 1, insn.assignee_name: prim.Variable(insn.assignee_name + '_tail'),
                          **replace_tail_inames}
             assignee_tail = substitute(insn.assignee, subst_map)
-            expr_tail = prim.Sum((substitute(var_left, {iname_inner: 0, **replace_tail_inames}), assignee_tail))
+            expr_tail = prim.Sum((substitute(var_left, {inner_iname: 0, **replace_tail_inames}), assignee_tail))
 
             write_to_tail_ids = tuple(i.id for i in lp.find_instructions(knl,
                                                                          Writes(get_pymbolic_basename(assignee_tail))))
@@ -159,39 +167,56 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
                                            id=id_accum_tail,
                                            depends_on=(frozenset({id_accum, id_permute, id_set_left, id_init_a}) |
                                                        frozenset(write_to_tail_ids)),
-                                           within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer}) -
+                                           within_inames=(insn.within_inames - frozenset({inner_iname, outer_iname}) -
                                                           inames_micro) | inames_tail,
-                                           tags=frozenset({'tail'})))
+                                           tags=frozenset({'tail_vec{}'.format(vcl_size),
+                                                           'vectorized_{}'.format(level)})))
         else:
-            new_insns.append(insn)
+            if insn.id.endswith('tail') and insn.id.replace('_tail', '') in accum_ids:
+                accum_id = insn.id.replace('_tail', '')
+                new_insns.append(insn.copy(depends_on=insn.depends_on | frozenset({accum_id + '_accum_tail'})))
+            else:
+                new_insns.append(insn)
 
     return knl.copy(instructions=new_insns,
                     temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))
 
 
-def add_vcl_access(knl, iname_inner):
-    from loopy.match import Reads, Tagged
-    accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum'))))
-    read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias'))))
+def add_vcl_access(knl, inner_iname, vcl_size, level=0):
+    accum_insns = set((insn.id for insn in lp.find_instructions(knl, And((Tagged('accum_vec{}'.format(vcl_size)),
+                                                                          Iname(inner_iname))))))
+    read_insns = set((insn.id for insn in lp.find_instructions(knl, And((Reads('*alias'), Iname(inner_iname))))))
     vectorized_insns = accum_insns | read_insns
 
+    alias_suffix = 'alias'
+    vector_sufix = 'vec{}'.format(vcl_size)
+
     from loopy.symbolic import CombineMapper
     from loopy.symbolic import IdentityMapper
 
     class AliasIndexCollector(CombineMapper, IdentityMapper):
+        def __init__(self):
+            self.found_alias = False
+
         def combine(self, values):
             return sum(values, tuple())
 
         def map_constant(self, expr):
-            return tuple()
+            if self.found_alias:
+                return (expr,)
+            else:
+                return tuple()
 
         map_variable = map_constant
         map_function_symbol = map_constant
         map_loopy_function_identifier = map_constant
 
         def map_subscript(self, expr):
-            if expr.aggregate.name.endswith('alias'):
-                return expr.aggregate, expr.index_tuple
+            if expr.aggregate.name.endswith(alias_suffix):
+                self.found_alias = True
+                indices = self.combine((self.rec(index) for index in expr.index_tuple))
+                self.found_alias = False
+                return expr.aggregate, indices
             else:
                 return tuple()
 
@@ -208,23 +233,25 @@ def add_vcl_access(knl, iname_inner):
 
         alias, index = aic(insn.expression)
         name_alias = alias.name
-        name_vec = name_alias.replace('alias', 'vec')
+        name_vec = name_alias.replace(alias_suffix, vector_sufix)
         vectorized_insn_to_vector_names[id] = (name_alias, name_vec)
 
         # compute index without vec iname
         strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
-        index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
-                               if i != 0 and i.name != iname_inner))
+        flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
+                                    if not (isinstance(i, prim.Variable) and i.name == inner_iname)))
 
         # find write insns
         write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
 
         # add load instruction
         load_id = idg('insn_' + name_vec + '_load')
-        call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
+        call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),))
         load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
                                              id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),
-                                             depends_on=insn.depends_on | write_ids,))
+                                             depends_on=insn.depends_on | write_ids,
+                                             depends_on_is_final=True,
+                                             tags=frozenset({'vectorized_{}'.format(level)})))
         read_dependencies.setdefault(id, set())
         read_dependencies[id].add(load_id)
 
@@ -235,61 +262,66 @@ def add_vcl_access(knl, iname_inner):
 
         alias, index = aic(insn.expression)
         name_alias = alias.name
-        name_vec = name_alias.replace('alias', 'vec')
+        name_vec = name_alias.replace(alias_suffix, vector_sufix)
         vectorized_insn_to_vector_names[id] = (name_alias, name_vec)
 
         # flat index without vec iname
         strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
-        index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
-                               if i != 0 and i.name != iname_inner))
+        flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
+                                    if not (isinstance(i, prim.Variable) and i.name == inner_iname)))
 
         # find write insns
         write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
 
         # add store instruction
         store_id = idg('insn_' + name_vec + '_store')
-        call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
+        call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),))
         store_insns.append(lp.CallInstruction(assignees=(), expression=call_store,
                                               id=store_id, within_inames=insn.within_inames,
                                               depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] |
-                                                          write_ids)))
+                                                          write_ids),
+                                              depends_on_is_final=True,
+                                              tags=frozenset({'vectorized_{}'.format(level)})))
 
     # replace alias with vcl vector, except for accumulation assignee
-    vector_alias = [a for a in knl.arg_dict if a.endswith('alias')]
+    vector_alias = [a for a in knl.arg_dict if a.endswith(alias_suffix)]
     dim = world_dimension()
     dim_names = ["x", "y", "z"] + [str(i) for i in range(4, dim + 1)]
     # remove CInstructions since loopy extract expects to get only assignments
-    knl_without_cinsn = knl.copy(instructions=[insn for insn in knl.instructions
-                                               if not isinstance(insn, lp.CInstruction)])
+    knl_with_subst_insns = knl.copy(instructions=[insn for insn in lp.find_instructions(knl, Iname(inner_iname))
+                                                  if not isinstance(insn, lp.CInstruction)])
     for alias in vector_alias:
         # Rename lhs which would match the substitution rule since loopy doesn't want substitutions as lhs
         new_insns = []
-        for insn in knl_without_cinsn.instructions:
+        for insn in knl_with_subst_insns.instructions:
             if isinstance(insn, lp.Assignment) and isinstance(insn.assignee, prim.Subscript):
                 if insn.assignee.aggregate.name == alias:
                     new_insns.append(insn.copy(assignee=prim.Subscript(prim.Variable('dummy_' + alias),
                                                                        insn.assignee.index_tuple)))
-                    pass
                 else:
                     new_insns.append(insn)
             else:
                 new_insns.append(insn)
-        knl_without_cinsn = knl_without_cinsn.copy(instructions=new_insns)
-
-        # substitution rule for alias[ex_outer,ex_inner, ey, ix, iy] -> vec[ex_inner]
-        parameters = 'ex_o,ex_i,' + ','.join(['e' + d for d in dim_names[1:dim]]) + \
-                     ',ix,' + ','.join(['i' + d for d in dim_names[1:dim]])
-        knl_without_cinsn = lp.extract_subst(knl_without_cinsn, alias + '_subst', '{}[{}]'.format(alias, parameters),
-                                             parameters=parameters)
-        new_subst = knl_without_cinsn.substitutions.copy()
+        knl_with_subst_insns = knl_with_subst_insns.copy(instructions=new_insns)
+
+        # substitution rule for alias[[ex_o]*l,ex_inner, ey, ix, iy] -> vec[ex_inner]
+        parameters = ','.join(['ex_o{}'.format(l) for l in range(level + 1)]) + \
+                     ',v_i,' + \
+                     ','.join(['e' + d for d in dim_names[1:dim]]) + \
+                     ',ix,' + \
+                     ','.join(['i' + d for d in dim_names[1:dim]])
+        knl_with_subst_insns = lp.extract_subst(knl_with_subst_insns,
+                                                alias + '_subst', '{}[{}]'.format(alias, parameters),
+                                                parameters=parameters)
+        new_subst = knl_with_subst_insns.substitutions.copy()
         rule = new_subst[alias + '_subst']
-        rule.expression = prim.Subscript(prim.Variable(alias.replace('alias', 'vec')), (prim.Variable('ex_i'),))
-        knl_without_cinsn = knl_without_cinsn.copy(substitutions=new_subst)
+        rule.expression = prim.Subscript(prim.Variable(alias.replace(alias_suffix, vector_sufix)),
+                                         (prim.Variable('v_i'),))
+        knl_with_subst_insns = knl_with_subst_insns.copy(substitutions=new_subst)
 
-    from loopy.match import All
-    knl_without_cinsn = lp.expand_subst(knl_without_cinsn, All())
-    knl = knl_without_cinsn.copy(instructions=knl_without_cinsn.instructions + [insn for insn in knl.instructions
-                                                                                if isinstance(insn, lp.CInstruction)])
+    knl_with_subst_insns = lp.expand_subst(knl_with_subst_insns, Iname(inner_iname))
+    knl = knl.copy(instructions=knl_with_subst_insns.instructions + [insn for insn in knl.instructions
+                                                                     if insn.id not in knl_with_subst_insns.id_to_insn])
 
     # add store and load dependencies and set right accumulation assignee
     new_insns = []
@@ -305,17 +337,21 @@ def add_vcl_access(knl, iname_inner):
                 try:
                     assignee_vec = next((expr for expr in insn.expression.children
                                          if isinstance(expr, prim.Subscript) and
-                                         expr.aggregate.name.replace('vec', 'alias') ==
+                                         expr.aggregate.name.replace(vector_sufix, alias_suffix) ==
                                          assignee_alias.aggregate.name.replace('dummy_', '')))
                 except StopIteration:
                     from dune.codegen.error import CodegenVectorizationError
                     raise CodegenVectorizationError
                 new_insns.append(insn.copy(assignee=assignee_vec,
                                            depends_on=(insn.depends_on | read_dependencies[insn.id] |
-                                                       write_ids)))
+                                                       write_ids),
+                                           depends_on_is_final=True,
+                                           tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
             else:
                 new_insns.append(insn.copy(depends_on=(insn.depends_on | read_dependencies[insn.id] |
-                                                       write_ids)))
+                                                       write_ids),
+                                           depends_on_is_final=True,
+                                           tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
 
     return knl.copy(instructions=new_insns + load_insns + store_insns)
 
@@ -331,38 +367,66 @@ def find_accumulation_inames(knl):
     return inames
 
 
-def add_iname_array(knl, vec_iname):
-    insns_with_macro_points = lp.find_instructions(knl, Tagged(vec_iname))
+def add_iname_array(knl, iname):
+    insns_with_macro_points = lp.find_instructions(knl, Tagged(iname))
 
     if insns_with_macro_points:
-        array_name = vec_iname + '_arr'
-        vector_name = vec_iname + '_vec'
+        new_iname = knl.get_var_name_generator()(iname)
+
+        new_dom = BasicSet('{{ [{0}] : 0<={0}<{1} }}'.format(new_iname, get_form_option('number_of_blocks')))
+
+        array_name = iname + '_arr'
 
         new_temporaries = dict()
         new_temporaries[array_name] = DuneTemporaryVariable(array_name, managed=True,
                                                             shape=(get_form_option('number_of_blocks'),),
                                                             scope=lp.temp_var_scope.PRIVATE, dtype=np.float64,
-                                                            base_storage=vec_iname + '_buff',
+                                                            base_storage=array_name + '_buff',
                                                             _base_storage_access_may_be_aliasing=True)
-        new_temporaries[vector_name] = DuneTemporaryVariable(vector_name, managed=True,
-                                                             shape=(get_form_option('number_of_blocks'),),
-                                                             scope=lp.temp_var_scope.PRIVATE, dtype=np.float64,
-                                                             base_storage=vec_iname + '_buff',
-                                                             _base_storage_access_may_be_aliasing=True)
-        silenced_warning = ["read_no_write({})".format(vector_name)]
-
-        new_insns = [lp.Assignment(assignee=prim.Subscript(prim.Variable(array_name), (prim.Variable(vec_iname),)),
-                                   expression=prim.Variable(vec_iname),
-                                   id='init_{}_buffer'.format(vec_iname),
-                                   within_inames=frozenset({vec_iname}), within_inames_is_final=True)]
 
         replacemap = dict()
-        replacemap[vec_iname] = prim.Subscript(prim.Variable(vector_name), (prim.Variable(vec_iname),))
+        replacemap[iname] = prim.Subscript(prim.Variable(array_name), (prim.Variable(iname),))
+
+        new_insns = [lp.Assignment(assignee=prim.Subscript(prim.Variable(array_name), (prim.Variable(new_iname),)),
+                                   expression=prim.Variable(new_iname),
+                                   id='init_{}_buffer'.format(array_name),
+                                   within_inames=frozenset({new_iname}), within_inames_is_final=True)]
+
+        for insn in knl.instructions:
+            if insn in insns_with_macro_points:
+                transformed_insn = insn.with_transformed_expressions(lambda expr: substitute(expr, replacemap))
+                new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(array_name)))
+            else:
+                new_insns.append(insn)
+
+        knl = knl.copy(instructions=new_insns, domains=knl.domains + [new_dom],
+                       temporary_variables=dict(**knl.temporary_variables, **new_temporaries))
+
+    return knl
+
+
+def add_vcl_iname_array(knl, iname, vec_iname, vcl_size, level):
+    insns_with_macro_points = lp.find_instructions(knl, And((Tagged(iname), Iname(vec_iname))))
 
+    if insns_with_macro_points:
+        iname_array = iname + '_arr'
+        vector_name = iname + '_vec{}'.format(vcl_size)
+
+        new_temporaries = {vector_name: DuneTemporaryVariable(vector_name, managed=True,
+                                                              shape=(get_form_option('number_of_blocks'),),
+                                                              scope=lp.temp_var_scope.PRIVATE, dtype=np.float64,
+                                                              base_storage=iname_array + '_buff',
+                                                              _base_storage_access_may_be_aliasing=True)}
+        silenced_warning = ["read_no_write({})".format(vector_name)]
+
+        replacemap = {iname_array: prim.Variable(vector_name)}
+
+        new_insns = []
         for insn in knl.instructions:
             if insn in insns_with_macro_points:
                 transformed_insn = insn.with_transformed_expressions(lambda expr: substitute(expr, replacemap))
-                new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(vec_iname)))
+                new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(iname_array),
+                                                       tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
             else:
                 new_insns.append(insn)
 
@@ -370,27 +434,120 @@ def add_iname_array(knl, vec_iname):
                        temporary_variables=dict(**knl.temporary_variables, **new_temporaries),
                        silenced_warnings=knl.silenced_warnings + silenced_warning)
 
-        knl = lp.duplicate_inames(knl, [vec_iname], within=Id('init_{}_buffer'.format(vec_iname)))
+        knl = lp.split_array_axis(knl, (vector_name,), 0, vcl_size)
+        knl = lp.tag_array_axes(knl, (vector_name,), ('c', 'vec'))
+
+    return knl
+
+
+def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_size, level):
+    tail_size = get_form_option('number_of_blocks') % vcl_size
+    new_dom = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(tail_iname, tail_size))
+
+    insns_to_duplicate = lp.find_instructions(knl, Iname(inner_iname))
+    ids_to_duplicate = tuple((insn.id for insn in insns_to_duplicate))
+
+    subst_map = dict([(outer_iname, outer_bound // vcl_size),
+                      (inner_iname, prim.Variable(tail_iname))])
+
+    temporaries_to_duplicate = dict()
+    for insn in insns_to_duplicate:
+        if isinstance(insn, lp.Assignment):
+            assignee = insn.assignee
+            name = get_pymbolic_basename(assignee)
+            if name in knl.temporary_variables:
+                new_name = name + '_tail'
+                temporaries_to_duplicate[new_name] = knl.temporary_variables[name].copy(name=new_name)
+                subst_map[name] = prim.Variable(new_name)
+
+    new_insns = []
+    for insn in insns_to_duplicate:
+        new_insn = insn.with_transformed_expressions(lambda e: substitute(e, subst_map))
+        new_depends_on = frozenset((insn_id + '_tail' if insn_id in ids_to_duplicate else insn_id
+                                    for insn_id in insn.depends_on))
+        new_within_inames = frozenset((iname + '_tail' if iname == inner_iname else iname
+                                       for iname in insn.within_inames)) - frozenset({outer_iname})
+        new_insns.append(new_insn.copy(id=insn.id + '_tail', depends_on=new_depends_on,
+                                       within_inames=new_within_inames,
+                                       tags=insn.tags | frozenset({'tail_{}'.format(level)})))
+
+    knl = knl.copy(domains=knl.domains + [new_dom], instructions=knl.instructions + new_insns,
+                   temporary_variables=dict(**knl.temporary_variables, **temporaries_to_duplicate))
+
+    common_inames = knl.all_inames()
+    for insn in new_insns:
+        common_inames = common_inames & (insn.within_inames | insn.reduction_inames())
+
+    if get_form_option('vectorization_blockstructured_tail_ordering') == 'blocked':
+        # TODO need to be more clever to get the right inames
+        macro_inames = frozenset((iname + '_0' * level) for iname in sub_element_inames())
+        common_inames = common_inames - macro_inames
+
+    additional_inames_to_duplicate = frozenset()
+    for insn in new_insns:
+        insn_inames = insn.within_inames | insn.reduction_inames()
+        additional_inames_to_duplicate = additional_inames_to_duplicate | (insn_inames - common_inames)
+
+    knl = lp.duplicate_inames(knl, tuple(additional_inames_to_duplicate),
+                              Or(tuple((Id(insn.id) for insn in new_insns))))
+
+    return lp.make_reduction_inames_unique(knl)
+
+
+def do_vectorization(knl, orig_iname, vec_iname, iname_bound, vcl_size, level=0):
+    inner_iname = vec_iname + '_inner'
+    outer_iname = vec_iname + '_outer'
+
+    tail_size = iname_bound % vcl_size
+    if get_form_option('vectorization_blockstructured_tail'):
+        tail_vcl_size = vcl_size
+        while tail_vcl_size > tail_size:
+            tail_vcl_size = tail_vcl_size // 2
+        vectorize_tail = tail_vcl_size > 1
+    else:
+        vectorize_tail = False
+
+    # manually add tail, since split_iname with slabs tries to vectorize the tail
+    if tail_size > 0:
+        # fake suitable loop bound
+        vectorizable_bound = (iname_bound // vcl_size) * vcl_size
+        from loopy.kernel.tools import DomainChanger
+        domch = DomainChanger(knl, (vec_iname,))
+        knl = knl.copy(domains=domch.get_domains_with(
+            BasicSet('{{ [{0}]: 0<={0}<{1} }}'.format(vec_iname, vectorizable_bound))))
+
+        knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname)
+
+        tail_iname = vec_iname + '_inner' + '_tail'
+        knl = realize_tail(knl, inner_iname, outer_iname, iname_bound, tail_iname, vcl_size, level)
+    else:
+        knl = lp.split_iname(knl, vec_iname, vcl_size)
+
+    knl = lp.tag_inames(knl, [(inner_iname, 'vec')])
+
+    array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
+    knl = lp.split_array_axis(knl, array_alias, level, vcl_size)
+
+    knl = add_vcl_temporaries(knl, vcl_size)
+    knl = add_vcl_iname_array(knl, orig_iname, inner_iname, vcl_size, level)
+    knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level)
+    knl = add_vcl_access(knl, inner_iname, vcl_size, level)
+
+    if tail_size > 0:
+        knl = lp.add_dependency(knl, And((Tagged('tail_{}'.format(level)), Not(Tagged('head*')))),
+                                Tagged('vectorized_{}'.format(level)))
+        if vectorize_tail:
+            knl = do_vectorization(knl, orig_iname, tail_iname, tail_size, tail_vcl_size, level + 1)
 
     return knl
 
 
 def vectorize_micro_elements(knl):
     vec_iname = "subel_x"
+    orig_iname = vec_iname
     if vec_iname in knl.all_inames() and get_global_context_value('integral_type') == 'cell':
         vcl_size = get_vcl_type_size(np.float64)
-
-        assert get_form_option('number_of_blocks') % vcl_size == 0
-
         knl = add_iname_array(knl, vec_iname)
 
-        knl = lp.split_iname(knl, vec_iname, vcl_size, inner_tag='vec')
-        array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
-        iname_vector = [a for a in knl.temporary_variables.keys() if a.endswith('vec')]
-        knl = lp.split_array_axis(knl, array_alias + iname_vector, 0, vcl_size)
-        knl = lp.tag_array_axes(knl, iname_vector, ('c', 'vec'))
-
-        knl = add_vcl_temporaries(knl)
-        knl = add_vcl_accum_insns(knl, vec_iname + '_inner', vec_iname + '_outer')
-        knl = add_vcl_access(knl, vec_iname + '_inner')
+        knl = do_vectorization(knl, orig_iname, orig_iname, get_form_option('number_of_blocks'), vcl_size)
     return knl
diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py
index d0cf1d4dc8b6880db13bfdec4458d815ea8208c2..97090e18852359b10d1a2d3f74a268a3abac60f1 100644
--- a/python/dune/codegen/generation/__init__.py
+++ b/python/dune/codegen/generation/__init__.py
@@ -16,6 +16,7 @@ from dune.codegen.generation.cpp import (base_class,
                                          class_member,
                                          constructor_parameter,
                                          dump_accumulate_timer,
+                                         register_liwkid_timer,
                                          end_of_file,
                                          include_file,
                                          initializer_list,
@@ -23,6 +24,7 @@ from dune.codegen.generation.cpp import (base_class,
                                          preamble,
                                          post_include,
                                          template_parameter,
+                                         dump_ssc_marks
                                          )
 
 from dune.codegen.generation.hooks import (hook,
diff --git a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py
index 29384f98554ab895d37670c017fe5ecc4f191655..2ea4c346590ee80ef329fdc9394b9fbc3c59db9c 100644
--- a/python/dune/codegen/generation/cpp.py
+++ b/python/dune/codegen/generation/cpp.py
@@ -50,3 +50,15 @@ def dump_accumulate_timer(name):
 
     code = "DUMP_TIMER({},{},{},{});".format(get_option("instrumentation_level"), name, os, reset)
     return code
+
+
+@generator_factory(item_tags=("register_likwid_timers",))
+def register_liwkid_timer(name):
+    return "LIKWID_MARKER_REGISTER(\"{}\");".format(name)
+
+
+@generator_factory(item_tags=("register_ssc_marks",))
+def dump_ssc_marks(name):
+    from dune.codegen.pdelab.driver.timings import get_region_marks
+    return 'std::cout << "{}: " << {} << " <--> " << {} << std::endl;'.format(name,
+                                                                              *get_region_marks(name, driver=False))
diff --git a/python/dune/codegen/logging.conf b/python/dune/codegen/logging.conf
index 32974b12744dd1c23c36f561148943ec936b3ccb..1fcc4ba02962460ea3f1db8c678dbba5ef3fef2e 100644
--- a/python/dune/codegen/logging.conf
+++ b/python/dune/codegen/logging.conf
@@ -1,5 +1,5 @@
 [loggers]
-keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.vectorization
+keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.transformations,dune.codegen.sumfact.vectorization
 
 [handlers]
 keys=consoleHandler
@@ -16,6 +16,12 @@ handlers=consoleHandler
 qualname=dune.codegen.pdelab.localoperator
 propagate=0
 
+[logger_dune.codegen.sumfact.transformations]
+level=INFO
+handlers=consoleHandler
+qualname=dune.codegen.sumfact.transformations
+propagate=0
+
 [logger_dune.codegen.sumfact.vectorization]
 level=INFO
 handlers=consoleHandler
diff --git a/python/dune/codegen/loopy/symbolic.py b/python/dune/codegen/loopy/symbolic.py
index 799c7d6040e2e7d9e40c296d73c391aeb43cc6f3..c76fbb063b88b62732ac54bbce39f532ba96aa9a 100644
--- a/python/dune/codegen/loopy/symbolic.py
+++ b/python/dune/codegen/loopy/symbolic.py
@@ -122,6 +122,7 @@ lp.type_inference.TypeInferenceMapper.map_vectorized_sumfact_kernel = needs_reso
 
 # FusedMultiplyAdd node
 lp.symbolic.IdentityMapper.map_fused_multiply_add = identity_map_fused_multiply_add
+lp.symbolic.SubstitutionMapper.map_fused_multiply_add = identity_map_fused_multiply_add
 lp.symbolic.WalkMapper.map_fused_multiply_add = walk_map_fused_multiply_add
 lp.symbolic.StringifyMapper.map_fused_multiply_add = stringify_map_fused_multiply_add
 lp.symbolic.DependencyMapper.map_fused_multiply_add = dependency_map_fused_multiply_add
diff --git a/python/dune/codegen/loopy/transformations/instrumentation.py b/python/dune/codegen/loopy/transformations/instrumentation.py
index 7b13a09e597490dd1bef85344c9be6bdfb859dd3..2fab53a6215a15f0e06e7d42a18f4736b46da34a 100644
--- a/python/dune/codegen/loopy/transformations/instrumentation.py
+++ b/python/dune/codegen/loopy/transformations/instrumentation.py
@@ -1,9 +1,11 @@
 """ Add instrumentation instructions to a kernel """
 
 from dune.codegen.generation import (dump_accumulate_timer,
+                                     register_liwkid_timer,
                                      post_include,
                                      )
 from dune.codegen.options import get_option
+from dune.codegen.pdelab.driver.timings import start_region_timer_instruction, stop_region_timer_instruction
 
 import loopy as lp
 
@@ -67,28 +69,25 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     # Define the start instruction and correct dependencies for it
     start_id = "{}_start".format(ident)
     start_depends = _union(tuple(i.depends_on for i in insns)).difference(frozenset(i.id for i in insns))
-    start_insn = lp.CInstruction([],
-                                 "HP_TIMER_START({});".format(identifier),
-                                 id=start_id,
-                                 within_inames=within,
-                                 depends_on=depends_on.union(start_depends),
-                                 boostable_into=frozenset(),
-                                 tags=uniontags,
-                                 )
+    start_insn = start_region_timer_instruction(identifier,
+                                                id=start_id,
+                                                within_inames=within,
+                                                depends_on=depends_on.union(start_depends),
+                                                boostable_into=frozenset(),
+                                                tags=uniontags,)
 
     # Add dependencies on the timing instructions
     rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({start_id}))) for i in insns])
 
     # Define the stop instruction and correct dependencies for it
     stop_id = "{}_stop".format(ident)
-    stop_insn = lp.CInstruction([],
-                                "HP_TIMER_STOP({});".format(identifier),
-                                id=stop_id,
-                                within_inames=within,
-                                depends_on=frozenset(i.id for i in insns),
-                                boostable_into=frozenset(),
-                                tags=uniontags,
-                                )
+    stop_insn = stop_region_timer_instruction(identifier,
+                                              id=stop_id,
+                                              within_inames=within,
+                                              depends_on=frozenset(i.id for i in insns),
+                                              boostable_into=frozenset(),
+                                              tags=uniontags,
+                                              )
 
     # Find all the instructions that should depend on stop
     dep_insns = filter(lambda i: _intersect((i.depends_on, frozenset(i.id for i in insns))),
@@ -97,8 +96,11 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({stop_id}))) for i in dep_insns])
 
     # Trigger code generation on the file/operator level
-    post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag)
-    dump_accumulate_timer(identifier)
+    if get_option("use_likwid"):
+        register_liwkid_timer(identifier)
+    else:
+        post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag)
+        dump_accumulate_timer(identifier)
 
     # Filter all the instructions which were untouched
     other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions))
diff --git a/python/dune/codegen/loopy/transformations/performance.py b/python/dune/codegen/loopy/transformations/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..02644e80c3c0f782309a165c050949e0f1d71cb2
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/performance.py
@@ -0,0 +1,8 @@
+from dune.codegen.options import get_form_option
+from dune.codegen.sumfact.transformations import sumfact_performance_transformations
+
+
+def performance_transformations(kernel, signature):
+    if get_form_option("sumfact"):
+        kernel = sumfact_performance_transformations(kernel, signature)
+    return kernel
diff --git a/python/dune/codegen/loopy/transformations/remove_reductions.py b/python/dune/codegen/loopy/transformations/remove_reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ccca09c230f5ec32a9134035f3e883c63cc3680
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/remove_reductions.py
@@ -0,0 +1,71 @@
+import loopy as lp
+
+import pymbolic.primitives as prim
+
+
+def remove_reduction(knl, match):
+    """Remove all matching reductions and do direct accumulation in the assignee instead"""
+
+    # Find reductions
+    for instr in lp.find_instructions(knl, match):
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            instructions = []
+
+            # Dependencies
+            depends_on = instr.depends_on
+            depending = []
+            for i in knl.instructions:
+                if instr.id in i.depends_on:
+                    depending.append(i.id)
+
+            # Remove the instruction from the kernel
+            knl = lp.remove_instructions(knl, set([instr.id]))
+
+            # Add instruction that sets assignee to zero
+            id_zero = instr.id + '_set_zero'
+            instructions.append(lp.Assignment(instr.assignee,
+                                              0,
+                                              within_inames=instr.within_inames,
+                                              id=id_zero,
+                                              tags=('set_zero',)
+                                              ))
+
+            # Add instruction that accumulates directly in assignee
+            assignee = instr.assignee
+            expression = prim.Sum((assignee, instr.expression.expr))
+            within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
+            id_accum = instr.id + '_accum'
+            instructions.append(lp.Assignment(assignee,
+                                              expression,
+                                              within_inames=within_inames,
+                                              id=id_accum,
+                                              depends_on=frozenset((id_zero,) + tuple(depends_on)),
+                                              tags=('assignment',)))
+
+            knl = knl.copy(instructions=knl.instructions + instructions)
+
+            # Restore dependencies
+            for dep in depending:
+                match = lp.match.Id(dep)
+                knl = lp.add_dependency(knl, match, id_accum)
+    return knl
+
+
+def remove_all_reductions(knl):
+    """Remove all reductions from loopy kernel
+
+    This removes all reductions by instead setting the assignee to zero and
+    directly accumulating in the assignee.
+    """
+    # Find ids of all reductions
+    ids = []
+    for instr in knl.instructions:
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            ids.append(instr.id)
+
+    # Remove reductions
+    for id in ids:
+        match = lp.match.Id(id)
+        knl = remove_reduction(knl, match)
+
+    return knl
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index a3111553a1f69aeffc3f8da5286fd201dd5ec0ec..9407709a515ef497b9f9b7561f570cf0411dd4ed 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -37,6 +37,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     debug_cache_with_stack = CodegenOption(default=False, helpstr="Store stack along with cache objects. Makes debugging caching issues easier.")
     driver_file = CodegenOption(helpstr="The filename for the generated driver header")
     explicit_time_stepping = CodegenOption(default=False, helpstr="use explicit time stepping")
+    time_stepping_order = CodegenOption(default=1, helpstr="Order of the time stepping method")
     exact_solution_expression = CodegenOption(helpstr="name of the exact solution expression in the ufl file")
     compare_l2errorsquared = CodegenOption(helpstr="maximal allowed l2 error squared of difference between numerical solution and interpolation of exact solution (NOTE: requires --exact-solution-expression)")
     grid_info = CodegenOption(default=None, helpstr="Path to file with information about facedir and facemod variations. This can be used to limit the generation of skeleton kernels.")
@@ -49,12 +50,17 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     architecture = CodegenOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl|skylake")
     yaspgrid_offset = CodegenOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
     grid_unstructured = CodegenOption(default=False, helpstr="Set to true if you want to use an unstructured grid.")
+    grid_consistent = CodegenOption(default=False, helpstr="The used grid is already consistent.")
     precision_bits = CodegenOption(default=64, helpstr="The number of bits for the floating point type")
     overlapping = CodegenOption(default=False, helpstr="Use an overlapping solver and constraints. You still need to make sure to construct a grid with overlap! The parallel option will be set automatically.")
     operators = CodegenOption(default="r", helpstr="A comma separated list of operators, each name will be interpreted as a subsection name within the formcompiler section")
     target_name = CodegenOption(default=None, helpstr="The target name from CMake")
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
+    use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.")
+    use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.")
+    autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
+    with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi")
     permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.")
 
     # Arguments that are mainly to be set by logic depending on other options
@@ -84,6 +90,9 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact = CodegenOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
+    sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in sum factorization function using autotuning.")
+    sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.")
+    sumfact_performance_transformations_testrun = CodegenOption(default=0, helpstr="If larger than zero determines test case to run.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
@@ -102,12 +111,15 @@ class CodegenFormOptionsArray(ImmutableRecord):
     blockstructured = CodegenOption(default=False, helpstr="Use block structure")
     number_of_blocks = CodegenOption(default=1, helpstr="Number of sub blocks in one direction")
     vectorization_blockstructured = CodegenOption(default=False, helpstr="Vectorize block structuring")
+    vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'number_of_blocks' is not divisible by vector length")
+    vectorization_blockstructured_tail_ordering = CodegenOption(default='consecutive', helpstr="Ordering of the tail w.r.t the vectorized loop. Possible values: consecutive|blocked")
     adjoint = CodegenOption(default=False, helpstr="Generate adjoint operator")
     control = CodegenOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable")
     objective_function = CodegenOption(default=None, helpstr="Name of form representing the objective function in UFL file")
     control_variable = CodegenOption(default=None, helpstr="Name of control variable in UFL file")
     block_preconditioner_diagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the diagonal part of a block preconditioner")
     block_preconditioner_offdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the off-diagonal part of a block preconditioner")
+    block_preconditioner_pointdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the point diagonal part of a block preconditioner")
     geometry_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for geometries. Currently implemented mixins: generic, axiparallel, equidistant, sumfact_multilinear, sumfact_axiparallel, sumfact_equidistant")
     quadrature_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for quadrature. Currently implemented: generic, sumfact")
     basis_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for basis function evaluation. Currently implemented: generic, sumfact")
@@ -122,6 +134,20 @@ _global_options = CodegenGlobalOptionsArray()
 _form_options = {}
 
 
+def show_options():
+    def subopt(arr):
+        for k, v in arr.__dict__.items():
+            if isinstance(v, CodegenOption) and v.helpstr is not None:
+                print("{}\n    {}".format(k, v.helpstr))
+
+    print("This is a summary of options available for the code generation process:\n")
+    print("The following options can be given in the [formcompiler] section:")
+    subopt(CodegenGlobalOptionsArray)
+
+    print("\nThe following options can be given in a form-specific subsection of [formcompiler]:")
+    subopt(CodegenFormOptionsArray)
+
+
 def initialize_options():
     """ Initialize the options from the command line """
     global _global_options
@@ -206,10 +232,16 @@ def process_form_options(opt, form):
     if opt.filename is None:
         opt = opt.copy(filename="{}_{}_file.hh".format(get_option("target_name"), opt.classname))
 
+    if opt.block_preconditioner_pointdiagonal:
+        opt = opt.copy(generate_jacobians=False,
+                       basis_mixins="sumfact_pointdiagonal",
+                       accumulation_mixins="sumfact_pointdiagonal",
+                       )
+
     if opt.block_preconditioner_diagonal or opt.block_preconditioner_offdiagonal:
         assert opt.numerical_jacobian is False
         opt = opt.copy(generate_residuals=False,
-                       generate_jacobians=False,
+                       generate_jacobians=True,
                        matrix_free=True,
                        )
 
diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py
index c3cc48298c1b2efb70c6c7716b29b221486044e1..dc1acd660c137c42be6fb65bb687bafe03fbc730 100644
--- a/python/dune/codegen/pdelab/argument.py
+++ b/python/dune/codegen/pdelab/argument.py
@@ -19,6 +19,7 @@ from dune.codegen.pdelab.spaces import (lfs_iname,
                                         )
 from dune.codegen.pdelab.restriction import restricted_name
 from dune.codegen.ufl.modified_terminals import Restriction
+from dune.codegen.options import get_form_option
 
 from pymbolic.primitives import Call, Subscript, Variable
 
@@ -116,6 +117,10 @@ def type_coefficientcontainer():
     return "X"
 
 
+def type_linearizationpointcontainer():
+    return "Z"
+
+
 def name_jacobian(restriction1, restriction2):
     # Restrictions may only differ if NONE
     if (restriction1 == Restriction.NONE) or (restriction2 == Restriction.NONE):
@@ -144,6 +149,8 @@ def name_accumulation_variable(restrictions=None):
                 restrictions = (Restriction.NONE,)
             else:
                 restrictions = (Restriction.POSITIVE,)
+        if get_form_option("block_preconditioner_pointdiagonal"):
+            restrictions = restrictions[:1]
         return name_residual(*restrictions)
     if ft == 'jacobian':
         if restrictions is None:
diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py
index 6526dcbf85cfb60d663a3f0193dde07351a361bd..124354b50dd354263800c84d99425cc6c9a57c01 100644
--- a/python/dune/codegen/pdelab/driver/__init__.py
+++ b/python/dune/codegen/pdelab/driver/__init__.py
@@ -39,7 +39,10 @@ def get_form_ident():
 
 def get_form():
     data = get_global_context_value("data")
-    return data.object_by_name[get_form_option("form", get_form_ident())]
+    form = get_form_option("form")
+    if form is None:
+        form = get_form_ident()
+    return data.object_by_name[form]
 
 
 def get_dimension():
@@ -212,7 +215,10 @@ def name_initree():
 @preamble(section="init")
 def define_mpihelper(name):
     include_file("dune/common/parallel/mpihelper.hh", filetag="driver")
-    return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    if get_option("with_mpi"):
+        return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    else:
+        return "Dune::FakeMPIHelper& {} = Dune::FakeMPIHelper::instance(argc, argv);".format(name)
 
 
 def name_mpihelper():
@@ -270,19 +276,25 @@ def generate_driver():
 
     # Make sure that timestream is declared before retrieving chache items
     if get_option("instrumentation_level") >= 1:
-        from dune.codegen.pdelab.driver.timings import setup_timer, name_timing_stream
+        from dune.codegen.pdelab.driver.timings import setup_timer
         setup_timer()
-        timestream = name_timing_stream()
 
     from dune.codegen.pdelab.driver.error import return_statement
     return_statement()
 
     from dune.codegen.generation import retrieve_cache_items
-    from cgen import FunctionDeclaration, FunctionBody, Block, Value, LineComment, Line
+    from cgen import FunctionDeclaration, FunctionBody, Block, Value, LineComment, Line, Generable
     driver_signature = FunctionDeclaration(Value('int', 'main'), [Value('int', 'argc'), Value('char**', 'argv')])
 
     contents = []
 
+    # Assert that this program was called with ini file
+    contents += ['if (argc != 2){',
+                 '  std::cerr << "This program needs to be called with an ini file" << std::endl;',
+                 '  return 1;',
+                 '}',
+                 '']
+
     def add_section(tag, comment):
         tagcontents = [i for i in retrieve_cache_items("preamble and {}".format(tag), make_generable=True)]
         if tagcontents:
@@ -292,6 +304,11 @@ def generate_driver():
             contents.append(Line("\n"))
 
     add_section("init", "Initialize basic stuff...")
+
+    if get_option("instrumentation_level") >= 1:
+        init_contents = contents
+        contents = []
+
     add_section("grid", "Setup grid (view)...")
     add_section("fem", "Set up finite element maps...")
     add_section("gfs", "Set up grid function spaces...")
@@ -306,13 +323,14 @@ def generate_driver():
     add_section("error", "Maybe calculate errors for test results...")
 
     if get_option("instrumentation_level") >= 1:
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver")
-        contents.insert(0, Line(text="HP_TIMER_START(driver);\n"))
-        contents.insert(len(contents) - 1, Line(text="HP_TIMER_STOP(driver);\n"))
-        contents.insert(len(contents) - 1, Line(text="DUMP_TIMER({}, driver, {}, true);\n".format(get_option("instrumentation_level"), timestream)))
-    contents.insert(0, Line(text="\n"))
-    driver_body = Block(contents)
+        from dune.codegen.pdelab.driver.timings import timed_region
+        contents = init_contents + timed_region('driver', contents)
+
+    add_section("end", "Stuff that should happen at the end...")
+    add_section("return_stmt", "Return statement...")
+
+    contents.insert(0, "\n")
+    driver_body = Block([c if isinstance(c, Generable) else Line(c + '\n') for c in contents])
 
     # Wrap a try/catch block around the driver body
     from dune.codegen.cgen import CatchBlock, TryCatchBlock, Value, Block, Line
diff --git a/python/dune/codegen/pdelab/driver/error.py b/python/dune/codegen/pdelab/driver/error.py
index cf9fe42933ca41f2696b39324bb4f443c35d9003..02207b4986dccd7a652c1943402243e755a8c681 100644
--- a/python/dune/codegen/pdelab/driver/error.py
+++ b/python/dune/codegen/pdelab/driver/error.py
@@ -186,8 +186,7 @@ def compare_L2_squared():
             "  {} = true;".format(fail)]
 
 
-@preamble(section="error")
+@preamble(section="return_stmt")
 def return_statement():
-    from dune.codegen.pdelab.driver.error import name_test_fail_variable
     fail = name_test_fail_variable()
     return "return {};".format(fail)
diff --git a/python/dune/codegen/pdelab/driver/gridfunctionspace.py b/python/dune/codegen/pdelab/driver/gridfunctionspace.py
index 377b304b03499aa3c109390cf72f4af0f6327e9b..9aad1d5e6096c9d1735790badd63824f24026d42 100644
--- a/python/dune/codegen/pdelab/driver/gridfunctionspace.py
+++ b/python/dune/codegen/pdelab/driver/gridfunctionspace.py
@@ -82,7 +82,8 @@ def define_grid(name):
     _type = type_grid()
     # TODO: In principle this is only necessary if we use sum factorization in
     # one of the operators. So this could be turned off if that is not the case.
-    if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured"):
+    if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured") and not \
+            get_option("grid_consistent"):
         include_file("dune/consistent-edge-orientation/createconsistentgrid.hh", filetag="driver")
         return ["IniGridFactory<{}> factory({});".format(_type, ini),
                 "std::shared_ptr<{}> grid_nonconsistent = factory.getGrid();".format(_type),
diff --git a/python/dune/codegen/pdelab/driver/instationary.py b/python/dune/codegen/pdelab/driver/instationary.py
index 5cf7170f1dca6f4274db4bf77b4b1627466e20cb..355fd0e743f6a59716319241018d3c5794c784bd 100644
--- a/python/dune/codegen/pdelab/driver/instationary.py
+++ b/python/dune/codegen/pdelab/driver/instationary.py
@@ -132,10 +132,25 @@ def name_time():
 def typedef_timesteppingmethod(name):
     r_type = type_range()
     explicit = get_option('explicit_time_stepping')
+    order = get_option('time_stepping_order')
     if explicit:
-        return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type)
+        if order == 1:
+            return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type)
+        elif order == 2:
+            return "using {} = Dune::PDELab::HeunParameter<{}>;".format(name, r_type)
+        elif order == 3:
+            return "using {} = Dune::PDELab::Shu3Parameter<{}>;".format(name, r_type)
+        elif order == 4:
+            return "using {} = Dune::PDELab::RK4Parameter<{}>;".format(name, r_type)
+        else:
+            raise NotImplementedError("Time stepping method not supported")
     else:
-        return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type)
+        if order == 1:
+            return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type)
+        elif order == 2:
+            return "using {} = Dune::PDELab::Alexander2Parameter<{}>;".format(name, r_type)
+        elif order == 3:
+            return "using {} = Dune::PDELab::Alexander3Parameter<{}>;".format(name, r_type)
 
 
 def type_timesteppingmethod():
@@ -150,8 +165,12 @@ def define_timesteppingmethod(name):
     if explicit:
         return "{} {};".format(tsm_type, name)
     else:
-        ini = name_initree()
-        return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini)
+        order = get_option('time_stepping_order')
+        if order == 1:
+            ini = name_initree()
+            return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini)
+        else:
+            return "{} {};".format(tsm_type, name)
 
 
 def name_timesteppingmethod():
diff --git a/python/dune/codegen/pdelab/driver/solve.py b/python/dune/codegen/pdelab/driver/solve.py
index 79dfac051c66c2faa50c4e262eb419ed30b5fde1..4a6a3c9e7e235ee5368927fc6f97ac817ffde5f5 100644
--- a/python/dune/codegen/pdelab/driver/solve.py
+++ b/python/dune/codegen/pdelab/driver/solve.py
@@ -57,23 +57,8 @@ def dune_solve():
     if get_form_option("generate_jacobians"):
         print_matrix()
 
-    if get_option('instrumentation_level') >= 2:
-        from dune.codegen.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier
-        timestream = name_timing_stream()
-        setup_timer()
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(solve);", filetag="driver")
-
-        solve = ["HP_TIMER_START(solve);",
-                 "{}".format(solve),
-                 "HP_TIMER_STOP(solve);",
-                 "DUMP_TIMER({}, solve, {}, true);".format(get_option("instrumentation_level"), timestream),
-                 ]
-
-        if get_option('instrumentation_level') >= 3:
-            from dune.codegen.pdelab.driver.gridoperator import name_localoperator
-            lop_name = name_localoperator(form_ident)
-            solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+    from dune.codegen.pdelab.driver.timings import timed_region
+    solve = timed_region('solve', solve)
 
     return solve
 
diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py
index 714f263a353c5a8a3a6b3b83dbc651ffd3401961..6bbbd07e4b7701fe516eff9509525165ac23a5eb 100644
--- a/python/dune/codegen/pdelab/driver/timings.py
+++ b/python/dune/codegen/pdelab/driver/timings.py
@@ -1,12 +1,11 @@
 """ Timing related generator functions """
 
-from dune.codegen.options import get_option
 from dune.codegen.generation import (cached,
                                      include_file,
                                      pre_include,
-                                     post_include,
                                      preamble,
-                                     )
+                                     post_include)
+from dune.codegen.options import get_option
 from dune.codegen.pdelab.driver import (get_form_ident,
                                         is_linear,
                                         name_initree,
@@ -21,10 +20,13 @@ from dune.codegen.pdelab.driver.gridoperator import (name_gridoperator,
                                                      type_gridoperator,
                                                      )
 from dune.codegen.pdelab.driver.solve import (name_vector,
-                                              type_vector,
+                                              define_vector,
                                               )
 
 
+_sde_marks = {}
+
+
 @preamble(section="timings")
 def define_timing_identifier(name):
     ini = name_initree()
@@ -90,109 +92,203 @@ def name_timing_stream():
     return name
 
 
+def name_temporary_vector(name, form):
+    name = "{}_{}".format(name, form)
+    define_vector(name, form)
+    return name
+
+
+@preamble(section="timings")
+def define_jacobian(name, form_ident):
+    t_go = type_gridoperator(form_ident)
+    n_go = name_gridoperator(form_ident)
+    return ["using M_{} = typename {}::Traits::Jacobian;".format(form_ident, t_go),
+            "M_{} {}({});".format(form_ident, name, n_go)]
+
+
+def name_jacobian(form_ident):
+    name = "J_{}".format(form_ident)
+    define_jacobian(name, form_ident)
+    return name
+
+
+@preamble(section="init")
+def init_likwid():
+    return ["LIKWID_MARKER_INIT;", "LIKWID_MARKER_THREADINIT;"]
+
+
+@preamble(section="end")
+def finalize_likwid():
+    return ["LIKWID_MARKER_CLOSE;"]
+
+
+@preamble(section="timings")
+def local_operator_likwid():
+    lop_name = name_localoperator(get_form_ident())
+    return "{}.register_likwid_timers();".format(lop_name)
+
+
+@preamble(section="timings")
+def local_operator_ssc_marks():
+    lop_name = name_localoperator(get_form_ident())
+    return "{}.dump_ssc_marks();".format(lop_name)
+
+
+def ssc_macro():
+    return '#define __SSC_MARK(x) do{ __asm__ __volatile__' \
+           '("movl %0, %%ebx; .byte 100, 103, 144" : :"i"(x) : "%ebx"); } while(0)'
+
+
 @cached
 def setup_timer():
     # TODO check that we are using YASP?
-    if get_option('opcounter'):
-        pre_include("#define ENABLE_COUNTER", filetag="driver")
-    pre_include("#define ENABLE_HP_TIMERS", filetag="driver")
-    include_file("dune/codegen/common/timer.hh", filetag="driver")
+    if get_option("use_likwid"):
+        pre_include("#define LIKWID_PERFMON", filetag="driver")
+        include_file("likwid.h", filetag="driver")
+        init_likwid()
+        if get_option('instrumentation_level') >= 3:
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably")
+            local_operator_likwid()
+        finalize_likwid()
+    elif get_option("use_sde"):
+        post_include(ssc_macro(), filetag='driver')
+        if get_option('instrumentation_level') >= 3:
+            local_operator_ssc_marks()
+    else:
+        from dune.codegen.loopy.target import type_floatingpoint
+        pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver")
+        if get_option('opcounter'):
+            pre_include("#define ENABLE_COUNTER", filetag="driver")
+        pre_include("#define ENABLE_HP_TIMERS", filetag="driver")
+        include_file("dune/codegen/common/timer.hh", filetag="driver")
 
 
-@preamble(section="timings")
-def evaluate_residual_timer():
-    n_go = name_gridoperator(get_form_ident())
-    v = name_vector(get_form_ident())
-    t_v = type_vector(get_form_ident())
-    setup_timer()
+@preamble(section="init")
+def init_likwid_timer(region):
+    return ["LIKWID_MARKER_REGISTER(\"{}\");".format(region)]
 
-    if get_option('instrumentation_level') >= 2:
-        # Write back times
+
+def init_region_timer(region):
+    setup_timer()
+    if get_option("use_likwid"):
+        init_likwid_timer(region)
+    elif get_option("use_sde"):
+        pass
+    else:
         from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(residual_evaluation);", filetag="driver")
+        post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver")
+
+
+def get_region_marks(region, driver):
+    if driver:
+        return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 11, (2 * (len(_sde_marks) + 1) + 1) * 11))
+    else:
+        return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 1, (2 * (len(_sde_marks) + 1) + 1) * 1))
+
+
+def start_region_timer(region):
+    if get_option("use_likwid"):
+        return ["LIKWID_MARKER_START(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[0])]
+    else:
+        return ["HP_TIMER_START({});".format(region)]
+
+
+def stop_region_timer(region):
+    if get_option("use_likwid"):
+        return ["LIKWID_MARKER_STOP(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[1]),
+                "std::cout << \"Timed region {}: {} <--> {}\" << std::endl;".format(region, *marks)]
+    else:
         timestream = name_timing_stream()
-        print_times = []
+        return ["HP_TIMER_STOP({});".format(region),
+                "DUMP_TIMER({}, {}, {}, true);".format(get_option("instrumentation_level"), region, timestream)]
 
-    lop_name = name_localoperator(get_form_ident())
-    if get_option('instrumentation_level') >= 3:
-        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
-    if get_option('instrumentation_level') >= 2:
-        evaluation = ["HP_TIMER_START(residual_evaluation);",
-                      "{}.residual({}, r);".format(n_go, v),
-                      "HP_TIMER_STOP(residual_evaluation);",
-                      "DUMP_TIMER({}, residual_evaluation, {}, true);".format(get_option("instrumentation_level"), timestream)]
-        evaluation.extend(print_times)
+def start_region_timer_instruction(region, **kwargs):
+    if get_option("use_likwid"):
+        code = "LIKWID_MARKER_START(\"{}\");".format(region)
     else:
-        evaluation = ["{}.residual({}, r);".format(n_go, v)]
+        code = "HP_TIMER_START({});".format(region)
+    from loopy import CInstruction
+    return CInstruction([], code, **kwargs)
 
-    evaluation = ["{} r({});".format(t_v, v), "r=0.0;"] + evaluation
 
-    return evaluation
+def stop_region_timer_instruction(region, **kwargs):
+    if get_option("use_likwid"):
+        code = "LIKWID_MARKER_STOP(\"{}\");".format(region)
+    else:
+        code = "HP_TIMER_STOP({});".format(region)
+    from loopy import CInstruction
+    return CInstruction([], code, **kwargs)
 
 
-@preamble(section="timings")
-def apply_jacobian_timer():
-    n_go = name_gridoperator(get_form_ident())
-    v = name_vector(get_form_ident())
-    t_v = type_vector(get_form_ident())
-    setup_timer()
+def timed_region(region, actions):
+    if isinstance(actions, str):
+        actions = [actions]
+
+    assert(isinstance(actions, list))
 
     if get_option('instrumentation_level') >= 2:
-        # Write back times
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(apply_jacobian);", filetag="driver")
-        timestream = name_timing_stream()
+        assembly = []
         print_times = []
 
-    lop_name = name_localoperator(get_form_ident())
-    if get_option('instrumentation_level') >= 3:
-        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+        init_region_timer(region)
 
-    if is_linear():
-        declaration = ["{} j({});".format(t_v, v), "j=0.0;"]
-        evaluation = ["{}.jacobian_apply({}, j);".format(n_go, v)]
-    else:
-        declaration = ["{} j0({});".format(t_v, v), "j0=0.0;",
-                       "{} j1({});".format(t_v, v), "j1=0.0;"]
-        evaluation = ["{}.nonlinear_jacobian_apply({}, j0, j1);".format(n_go, v)]
+        if get_option('instrumentation_level') >= 3 and not (get_option('use_likwid') or get_option("use_sde")):
+            timestream = name_timing_stream()
+            lop_name = name_localoperator(get_form_ident())
+            print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
-    if get_option('instrumentation_level') >= 2:
-        evaluation = ["HP_TIMER_START(apply_jacobian);"] + evaluation + ["HP_TIMER_STOP(apply_jacobian);", "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)]
-        evaluation.extend(print_times)
+        assembly += start_region_timer(region)
+        assembly += actions
+        assembly += stop_region_timer(region)
 
-    return declaration + evaluation
+        return assembly + print_times
+    else:
+        return actions
 
 
 @preamble(section="timings")
-def assemble_matrix_timer():
-    t_go = type_gridoperator(get_form_ident())
+def evaluate_residual_timer():
     n_go = name_gridoperator(get_form_ident())
     v = name_vector(get_form_ident())
-    t_v = type_vector(get_form_ident())
-    setup_timer()
+    r = name_temporary_vector("r", get_form_ident())
 
-    if get_option('instrumentation_level') >= 2:
-        # Write back times
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(matrix_assembly);", filetag="driver")
-        timestream = name_timing_stream()
-        print_times = []
+    action = "{}.residual({}, {});".format(n_go, v, r)
 
-    lop_name = name_localoperator(get_form_ident())
-    if get_option('instrumentation_level') >= 3:
-        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+    return timed_region("residual_evaluation", action)
 
-    if get_option('instrumentation_level') >= 2:
-        assembly = ["HP_TIMER_START(matrix_assembly);",
-                    "{}.jacobian({},m);".format(n_go, v),
-                    "HP_TIMER_STOP(matrix_assembly);",
-                    "DUMP_TIMER({}, matrix_assembly, {}, true);".format(get_option("instrumentation_level"), timestream)]
-        assembly.extend(print_times)
+
+@preamble(section="timings")
+def apply_jacobian_timer():
+    form = get_form_ident()
+    n_go = name_gridoperator(form)
+    v = name_vector(form)
+
+    if is_linear():
+        j = name_temporary_vector("j", form)
+        action = "{}.jacobian_apply({}, {});".format(n_go, v, j)
     else:
-        assembly = ["{}.jacobian({},m);".format(n_go, v)]
+        j0 = name_temporary_vector("j0", form)
+        j1 = name_temporary_vector("j1", form)
+        action = "{}.nonlinear_jacobian_apply({}, {}, {});".format(n_go, v, j0, j1)
+
+    return timed_region("apply_jacobian", action)
+
+
+@preamble(section="timings")
+def assemble_matrix_timer():
+    n_go = name_gridoperator(get_form_ident())
+    v = name_vector(get_form_ident())
+    m = name_jacobian(get_form_ident())
 
-    assembly = ["using M = typename {}::Traits::Jacobian;".format(t_go),
-                "M m({});".format(n_go)] + assembly
+    action = "{}.jacobian({},{});".format(n_go, v, m)
 
-    return assembly
+    return timed_region("matrix_assembly", action)
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index 70ab9f4793a6163559d070e33e3600f9cd8c5578..0ca6bfb45ed80271af92800980df7405200dca8b 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -17,6 +17,7 @@ from dune.codegen.generation import (accumulation_mixin,
                                      constructor_parameter,
                                      domain,
                                      dump_accumulate_timer,
+                                     register_liwkid_timer,
                                      end_of_file,
                                      function_mangler,
                                      generator_factory,
@@ -31,6 +32,7 @@ from dune.codegen.generation import (accumulation_mixin,
                                      ReturnArg,
                                      run_hook,
                                      template_parameter,
+                                     dump_ssc_marks
                                      )
 from dune.codegen.cgen.clazz import (AccessModifier,
                                      BaseClass,
@@ -152,6 +154,11 @@ def enum_alpha():
     return _enum_alpha(ufl_measure_to_pdelab_measure(integral_type))
 
 
+@class_member(classtag="operator")
+def enum_skeleton_twosided():
+    return "enum { doSkeletonTwoSided = true };"
+
+
 def name_initree_member():
     define_initree("_iniParams")
     return "_iniParams"
@@ -278,7 +285,7 @@ def determine_accumulation_space(info, number):
     return AccumulationSpace(lfs=lfs,
                              index=lfsi,
                              restriction=info.restriction,
-                             element=element
+                             element=subel
                              )
 
 
@@ -593,6 +600,10 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     for trafo in transformations:
         kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
+    # Apply performance transformations
+    from dune.codegen.loopy.transformations.performance import performance_transformations
+    kernel = performance_transformations(kernel, signature)
+
     from dune.codegen.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
@@ -673,6 +684,32 @@ class TimerMethod(ClassMember):
         ClassMember.__init__(self, content)
 
 
+class RegisterLikwidMethod(ClassMember):
+    def __init__(self):
+        knl = name_example_kernel()
+        assert(knl is not None)
+
+        content = ["void register_likwid_timers()"
+                   "{"]
+        register_likwid_timers = [i for i in retrieve_cache_items(condition='register_likwid_timers')]
+        content.extend(map(lambda x: '  ' + x, register_likwid_timers))
+        content += ["}"]
+        ClassMember.__init__(self, content)
+
+
+class RegisterSSCMarksMethod(ClassMember):
+    def __init__(self):
+        knl = name_example_kernel()
+        assert(knl is not None)
+
+        content = ["void dump_ssc_marks()"
+                   "{"]
+        ssc_mark_dumps = [i for i in retrieve_cache_items(condition='register_ssc_marks')]
+        content.extend(map(lambda x: '  ' + x, ssc_mark_dumps))
+        content += ["}"]
+        ClassMember.__init__(self, content)
+
+
 class LoopyKernelMethod(ClassMember):
     def __init__(self, signature, kernel, add_timings=True, initializer_list=[]):
         from loopy import generate_body
@@ -693,26 +730,64 @@ class LoopyKernelMethod(ClassMember):
                 from dune.codegen.pdelab.signatures import assembler_routine_name
                 timer_name = assembler_routine_name() + '_kernel'
                 name_example_kernel(name=timer_name)
-                post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-                content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
-                dump_accumulate_timer(timer_name)
 
-            if add_timings and get_option("instrumentation_level") >= 4:
-                setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
-                post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
-                content.append('  HP_TIMER_START({});'.format(setuptimer))
-                dump_accumulate_timer(setuptimer)
+                if get_option('use_likwid'):
+                    from dune.codegen.pdelab.driver.timings import init_likwid_timer
+                    include_file("likwid.h", filetag="operatorfile")
+                    init_likwid_timer(timer_name)
+                    content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name))
+                    register_liwkid_timer(timer_name)
+                elif get_option('use_sde'):
+                    from dune.codegen.pdelab.driver.timings import get_region_marks, ssc_macro
+                    post_include(ssc_macro(), filetag='operatorfile')
+                    marks = get_region_marks(timer_name, driver=False)
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[0]))
+                    dump_ssc_marks(timer_name)
+                else:
+                    post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+                    content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
+                    dump_accumulate_timer(timer_name)
+
+                if add_timings and get_option("instrumentation_level") >= 4:
+                    setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
+                    if get_option('use_likwid'):
+                        from dune.codegen.pdelab.driver.timings import init_likwid_timer
+                        init_likwid_timer(setuptimer)
+                        content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer))
+                        register_liwkid_timer(setuptimer)
+                    elif get_option('use_sde'):
+                        from dune.codegen.pdelab.driver.timings import get_region_marks
+                        setup_marks = get_region_marks(setuptimer, driver=False)
+                        content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[0]))
+                        dump_ssc_marks(setuptimer)
+                    else:
+                        post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
+                        content.append('  HP_TIMER_START({});'.format(setuptimer))
+                        dump_accumulate_timer(setuptimer)
 
             # Add kernel preamble
             for i, p in kernel.preambles:
                 content.append('  ' + p)
 
+            if add_timings and get_option('instrumentation_level') >= 4:
+                if get_option('use_likwid'):
+                    content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[1]))
+                else:
+                    content.append('  ' + 'HP_TIMER_STOP({});'.format(setuptimer))
+
             # Add kernel body
             content.extend(l for l in generate_body(kernel).split('\n')[1:-1])
 
             # Stop timer
             if add_timings and get_option('instrumentation_level') >= 3:
-                content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
+                if get_option('use_likwid'):
+                    content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[1]))
+                else:
+                    content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
 
         content.append('}')
         ClassMember.__init__(self, content, name=kernel.name if kernel is not None else "")
@@ -800,6 +875,14 @@ def local_operator_default_settings(operator, form):
         base_class('Dune::PDELab::InstationaryLocalOperatorDefaultMethods<{}>'
                    .format(rf), classtag="operator")
 
+    # *always* add the volume pattern, PDELab cannot handle matrices without diagonal blocks
+    with global_context(integral_type="cell"):
+        enum_pattern()
+        pattern_baseclass()
+
+    if get_form_option("block_preconditioner_diagonal") or get_form_option("block_preconditioner_pointdiagonal"):
+        enum_skeleton_twosided()
+
 
 def measure_is_enabled(measure):
     option_dict = {"cell": "enable_volume",
@@ -814,6 +897,16 @@ def generate_residual_kernels(form, original_form):
     if not get_form_option("generate_residuals"):
         return {}
 
+    if get_form_option("block_preconditioner_pointdiagonal"):
+        from ufl import derivative
+        jacform = derivative(original_form, original_form.coefficients()[0])
+
+        from dune.codegen.ufl.preprocess import preprocess_form
+        jacform = preprocess_form(jacform).preprocessed_form
+
+        from dune.codegen.ufl.transformations.blockpreconditioner import diagonal_block_jacobian
+        form = diagonal_block_jacobian(jacform)
+
     logger = logging.getLogger(__name__)
     with global_context(form_type='residual'):
         operator_kernels = {}
@@ -916,6 +1009,12 @@ def generate_jacobian_kernels(form, original_form):
                     from dune.codegen.pdelab.signatures import assembler_routine_name
                     with global_context(kernel=assembler_routine_name()):
                         kernel = [k for k in generate_kernels_per_integral(jac_apply_form.integrals_by_type(measure))]
+
+                        if kernel:
+                            enum_pattern()
+                            pattern_baseclass()
+                            enum_alpha()
+
                 operator_kernels[(measure, 'jacobian_apply')] = kernel
 
                 # Generate dummy functions for those kernels, that vanished in the differentiation process
@@ -945,6 +1044,12 @@ def generate_jacobian_kernels(form, original_form):
                         from dune.codegen.pdelab.signatures import assembler_routine_name
                         with global_context(kernel=assembler_routine_name()):
                             kernel = [k for k in generate_kernels_per_integral(jacform.integrals_by_type(measure))]
+
+                            if kernel:
+                                enum_pattern()
+                                pattern_baseclass()
+                                enum_alpha()
+
                     operator_kernels[(measure, 'jacobian')] = kernel
 
                 # Generate dummy functions for those kernels, that vanished in the differentiation process
@@ -1141,7 +1246,12 @@ def generate_localoperator_file(kernels, filename):
 
     if get_option('instrumentation_level') >= 3:
         include_file('dune/codegen/common/timer.hh', filetag='operatorfile')
-        operator_methods.append(TimerMethod())
+        if get_option('use_likwid'):
+            operator_methods.append(RegisterLikwidMethod())
+        elif get_option('use_sde'):
+            operator_methods.append(RegisterSSCMarksMethod())
+        else:
+            operator_methods.append(TimerMethod())
     elif get_option('opcounter'):
         include_file('dune/codegen/common/timer.hh', filetag='operatorfile')
 
diff --git a/python/dune/codegen/pdelab/signatures.py b/python/dune/codegen/pdelab/signatures.py
index 09b832ac252138181d8be19c9fd4c098b5bb9b68..34acea22ea1fda1b8369bf499d7ce8ed0c37859d 100644
--- a/python/dune/codegen/pdelab/signatures.py
+++ b/python/dune/codegen/pdelab/signatures.py
@@ -9,6 +9,7 @@ from dune.codegen.pdelab.argument import (name_accumulation_variable,
                                           name_coefficientcontainer,
                                           type_coefficientcontainer,
                                           name_applycontainer,
+                                          type_linearizationpointcontainer,
                                           )
 from dune.codegen.pdelab.spaces import (name_testfunctionspace,
                                         type_testfunctionspace,
@@ -293,8 +294,9 @@ def nonlinear_jacobian_apply_volume_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 
 
 def nonlinear_jacobian_apply_volume_args():
@@ -312,8 +314,9 @@ def nonlinear_jacobian_apply_boundary_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 
 
 def nonlinear_jacobian_apply_boundary_args():
@@ -331,8 +334,9 @@ def nonlinear_jacobian_apply_skeleton_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, lfsut, cct, cct, lfsvt, avt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, lfsut, cct, lpt, lfsvt, avt, avt)
 
 
 def nonlinear_jacobian_apply_skeleton_args():
diff --git a/python/dune/codegen/pdelab/tensors.py b/python/dune/codegen/pdelab/tensors.py
index a924a39ae1bba19a94c47e5542567d8717cedf57..68f6d393129600f0fda995ea1a4f239ff844c173 100644
--- a/python/dune/codegen/pdelab/tensors.py
+++ b/python/dune/codegen/pdelab/tensors.py
@@ -22,7 +22,7 @@ def define_assembled_tensor(name, expr, visitor):
         visitor.indices = indices
         instruction(assignee=prim.Subscript(prim.Variable(name), indices),
                     expression=visitor.call(expr),
-                    forced_iname_deps=frozenset(visitor.interface.quadrature_inames()),
+                    forced_iname_deps=frozenset(visitor.quadrature_inames()),
                     depends_on=frozenset({lp.match.Tagged("sumfact_stage1")}),
                     tags=frozenset({"quad"}),
                     )
@@ -36,17 +36,43 @@ def name_assembled_tensor(o, visitor):
 
 
 @kernel_cached
+def code_generation_time_inversion(expr, visitor):
+    mat = np.ndarray(expr.ufl_shape)
+    for indices in it.product(*tuple(range(i) for i in expr.ufl_shape)):
+        visitor.indices = indices
+        val = visitor.call(expr.ufl_operands[0])
+        if not isinstance(val, (float, int)):
+            visitor.indices = None
+            return None
+
+        mat[indices] = val
+
+    visitor.indices = None
+    return np.linalg.inv(mat)
+
+
 def pymbolic_matrix_inverse(o, visitor):
+    # Try to evaluate the matrix at code generation time.
+    # If this works (it does e.g. for Maxwell on structured grids)
+    # we can invert the matrix at code generation time!!!
     indices = visitor.indices
     visitor.indices = None
+
+    mat = code_generation_time_inversion(o, visitor)
+    if mat is not None:
+        return mat[indices]
+
+    # If code generation time inversion failed, we assemble it in C++
+    # and invert it there.
     name = name_assembled_tensor(o.ufl_operands[0], visitor)
 
     instruction(code="{}.invert();".format(name),
-                within_inames=frozenset(visitor.interface.quadrature_inames()),
+                within_inames=frozenset(visitor.quadrature_inames()),
                 depends_on=frozenset({lp.match.Writes(name),
                                       lp.match.Tagged("sumfact_stage1"),
                                       }),
                 tags=frozenset({"quad"}),
+                assignees=frozenset({name}),
                 )
 
     visitor.indices = indices
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index ae4a763063f0dc25303fc517356b58b02551eebc..90fd01fd5b90b722c0a3ee194e1da82ed44f6eab 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -55,6 +55,7 @@ import pymbolic.primitives as prim
 from loopy.symbolic import WalkMapper
 import ufl.classes as uc
 from ufl import FiniteElement, MixedElement, TensorProductElement
+import itertools
 
 
 basis_sf_kernels = generator_factory(item_tags=("basis_sf_kernels",), context_tags='kernel', no_deco=True)
@@ -161,10 +162,12 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         if self.trial_element is None:
             return ()
         else:
-            from dune.codegen.sumfact.basis import SumfactBasisMixin
-            return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
+            mixin = get_form_option("basis_mixins")
+            from dune.codegen.generation import construct_from_mixins
+            MixinType = construct_from_mixins(mixins=[mixin], mixintype="basis", name="MixinType")
+            return MixinType.lfs_inames(MixinType(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The result of stage 2 has the correct quadrature permutation but no
         # cost permutation is applied. The inames for this method are
         # quadrature and cost permuted. This means we need to reverse the cost
@@ -175,7 +178,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -350,11 +354,68 @@ class SumfactAccumulationMixin(AccumulationMixinBase):
         return get_accumulation_info(expr, self)
 
     def list_accumulation_infos(self, expr):
-        return list_accumulation_infos(expr, self)
+        return itertools.product(_gradsplitting_generator(expr, self),
+                                 _trial_generator(expr, self),
+                                 )
 
     def generate_accumulation_instruction(self, expr):
         return generate_accumulation_instruction(expr, self)
 
+    def additional_matrix_sequence(self):
+        return None
+
+    @property
+    def prohibit_jacobian(self):
+        return False
+
+
+@accumulation_mixin("sumfact_pointdiagonal")
+class SumfactPointDiagonalAccumulationMixin(SumfactAccumulationMixin):
+    def additional_matrix_sequence(self):
+        info = self.current_info[1]
+        return construct_basis_matrix_sequence(transpose=True,
+                                               derivative=info.grad_index,
+                                               facedir=get_facedir(info.restriction),
+                                               facemod=get_facemod(info.restriction),
+                                               basis_size=get_basis_size(info),
+                                               )
+
+    def get_accumulation_info(self, expr):
+        element = expr.ufl_element()
+        leaf_element = element
+        element_index = 0
+        from ufl import MixedElement
+        if isinstance(expr.ufl_element(), MixedElement):
+            element_index = self.indices[0]
+            leaf_element = element.extract_component(element_index)[1]
+
+        restriction = self.restriction
+        if self.measure == 'exterior_facet':
+            from dune.codegen.pdelab.restriction import Restriction
+            restriction = Restriction.POSITIVE
+
+        grad_index = None
+        if self.reference_grad:
+            if isinstance(expr.ufl_element(), MixedElement):
+                grad_index = self.indices[1]
+            else:
+                grad_index = self.indices[0]
+
+        return SumfactAccumulationInfo(element=expr.ufl_element(),
+                                       element_index=element_index,
+                                       restriction=restriction,
+                                       grad_index=grad_index,
+                                       )
+
+    def list_accumulation_infos(self, expr):
+        return itertools.product(_gradsplitting_generator(expr, self, number=0),
+                                 _gradsplitting_generator(expr, self, number=1),
+                                 )
+
+    @property
+    def prohibit_jacobian(self):
+        return True
+
 
 class SumfactAccumulationInfo(ImmutableRecord):
     def __init__(self,
@@ -421,9 +482,9 @@ def _get_childs(element):
             yield (i, element.extract_component(i)[1])
 
 
-def _test_generator(expr, visitor):
+def _gradsplitting_generator(expr, visitor, number=0):
     from dune.codegen.ufl.modified_terminals import extract_modified_arguments
-    ma = extract_modified_arguments(expr, argnumber=0)
+    ma = extract_modified_arguments(expr, argnumber=number)
     if len(ma) == 0:
         return
     element = ma[0].argexpr.ufl_element()
@@ -439,7 +500,8 @@ def _test_generator(expr, visitor):
     for res in restrictions:
         for ei, e in _get_childs(element):
             for grad in (None,) + tuple(range(dim)):
-                yield SumfactAccumulationInfo(element_index=ei,
+                yield SumfactAccumulationInfo(element=element,
+                                              element_index=ei,
                                               restriction=res,
                                               grad_index=grad)
 
@@ -464,13 +526,20 @@ def _trial_generator(expr, visitor):
             yield SumfactAccumulationInfo(element_index=ei, restriction=res, element=e)
 
 
-def list_accumulation_infos(expr, visitor):
-    import itertools
-    return itertools.product(_test_generator(expr, visitor), _trial_generator(expr, visitor))
+def get_basis_size(info):
+    leaf_element = info.element
+    element_index = info.element_index
+    dim = world_dimension()
+    from ufl import MixedElement
+    if isinstance(leaf_element, MixedElement):
+        leaf_element = leaf_element.extract_component(element_index)[1]
+    degree = leaf_element._degree
+    if isinstance(degree, int):
+        degree = (degree,) * dim
+    return tuple(deg + 1 for deg in degree)
 
 
 def generate_accumulation_instruction(expr, visitor):
-    dim = world_dimension()
     test_info = visitor.test_info
     trial_info = visitor.trial_info
 
@@ -479,14 +548,7 @@ def generate_accumulation_instruction(expr, visitor):
     count_quadrature_point_operations(expr)
 
     # Number of basis functions per direction
-    leaf_element = test_info.element
-    from ufl import MixedElement
-    if isinstance(leaf_element, MixedElement):
-        leaf_element = leaf_element.extract_component(test_info.element_index)[1]
-    degree = leaf_element._degree
-    if isinstance(degree, int):
-        degree = (degree,) * dim
-    basis_size = tuple(deg + 1 for deg in degree)
+    basis_size = get_basis_size(test_info)
 
     # Anisotropic finite elements are not (yet) supported by Dune
     assert(size == basis_size[0] for size in basis_size)
@@ -522,20 +584,27 @@ def generate_accumulation_instruction(expr, visitor):
         derivative=test_info.grad_index,
         facedir=get_facedir(test_info.restriction),
         facemod=get_facemod(test_info.restriction),
-        basis_size=basis_size)
+        basis_size=basis_size,
+        additional_sequence=visitor.additional_matrix_sequence())
 
     jacobian_inames = trial_info.inames
     priority = test_info.grad_index
     if priority is None:
         priority = 3
 
+    trial_element = trial_info.element
+    trial_element_index = trial_info.element_index
+    if visitor.prohibit_jacobian:
+        trial_element = None
+        trial_element_index = 0
+
     output = AccumulationOutput(matrix_sequence=matrix_sequence,
                                 accumvar=accumvar,
                                 restriction=(test_info.restriction, trial_info.restriction),
                                 test_element=test_info.element,
                                 test_element_index=test_info.element_index,
-                                trial_element=trial_info.element,
-                                trial_element_index=trial_info.element_index,
+                                trial_element=trial_element,
+                                trial_element_index=trial_element_index,
                                 )
 
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 68d81957e00917d55383282c748879f7f96db587..1e801f4550d17d89ef8972fe3ffa867d4a44ea25 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -1,19 +1,27 @@
 """ Autotuning for sum factorization kernels """
 
-from dune.codegen.generation import cache_restoring, delete_cache_items
-from dune.codegen.loopy.target import DuneTarget
-from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
-from dune.codegen.options import get_option, set_option
-from dune.codegen.error import CodegenAutotuneError
-
-import loopy as lp
-from pytools import product
-
 import os
 import re
 import subprocess
 import filelock
 import hashlib
+import logging
+import json
+from operator import mul
+import pkg_resources
+from six.moves import reduce
+import textwrap
+import time
+
+import loopy as lp
+from pytools import product
+from cgen import ArrayOf, AlignedAttribute, Initializer
+
+from dune.codegen.generation import cache_restoring, delete_cache_items
+from dune.codegen.loopy.target import DuneTarget, type_floatingpoint
+from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
+from dune.codegen.options import get_option, set_option
+from dune.codegen.error import CodegenAutotuneError
 
 
 def get_cmake_cache_entry(entry):
@@ -65,29 +73,37 @@ def compiler_invocation(name, filename):
     return compile_flags
 
 
-def generate_standalone_code(sf, filename):
-    delete_cache_items("kernel_default")
+def write_global_data(sf, filename):
+    opcounting = get_option("opcounter")
+    with open(filename, "a") as f:
+        # Get kernel
+        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+        knl = realize_sumfact_kernel_function(sf)
+        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
+        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
+        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
 
-    with open(filename, "w") as f:
-        f.writelines(["#include \"config.h\"\n",
-                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
-                      "#include<dune/codegen/common/tsc.hh>\n",
-                      "#include<dune/codegen/common/vectorclass.hh>\n",
-                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
-                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
-                      "#include<random>\n",
-                      "#include<fstream>\n",
-                      "#include<iostream>\n",
-                      "\n"
-                      ])
+        target = DuneTarget()
+        from loopy.codegen import CodeGenerationState
+        codegen_state = CodeGenerationState(kernel=constructor_knl,
+                                            implemented_data_info=None,
+                                            implemented_domain=None,
+                                            implemented_predicates=frozenset(),
+                                            seen_dtypes=frozenset(),
+                                            seen_functions=frozenset(),
+                                            seen_atomic_dtypes=frozenset(),
+                                            var_subst_map={},
+                                            allow_complex=False,
+                                            is_generating_device_code=True,
+                                            )
 
-        f.writelines(["int main(int argc, char** argv)\n",
-                      "{\n",
-                      ])
+        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+            f.write("{}\n".format(next(iter(decl.generate()))))
 
+
+def write_setup_code(sf, filename, define_thetas=True):
+    with open(filename, "a") as f:
         # Setup a polynomial object (normally done in the LocalOperator members)
-        opcounting = get_option("opcounter")
-        set_option("opcounter", False)
         from dune.codegen.loopy.target import type_floatingpoint
         real = type_floatingpoint()
         f.write("  using RF = {};\n".format(real))
@@ -98,7 +114,7 @@ def generate_standalone_code(sf, filename):
         for deg in set(degs):
             f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
 
-        # Get kernels
+        # Get kernel
         from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
         knl = realize_sumfact_kernel_function(sf)
         constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
@@ -106,11 +122,12 @@ def generate_standalone_code(sf, filename):
         constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
 
         # Allocate buffers
+        alignment = get_option("max_vector_width") // 8
         size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
                    product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
         size = int(size * (get_option("precision_bits") / 8))
-        f.writelines(["  char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
-                      "  char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
+        f.writelines(["  char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
+                      "  char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
                       ])
 
         # Setup fastdg inputs
@@ -119,7 +136,7 @@ def generate_standalone_code(sf, filename):
                 f.write("{} = 0;\n".format(arg))
             else:
                 size = sf.interface.fastdg_interface_object_size
-                f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))
+                f.write("  RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment))
 
         # Write stuff into the input buffer
         f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
@@ -142,8 +159,9 @@ def generate_standalone_code(sf, filename):
                                             is_generating_device_code=True,
                                             )
 
-        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
-            f.write("  {}\n".format(next(iter(decl.generate()))))
+        if define_thetas:
+            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+                f.write("  {}\n".format(next(iter(decl.generate()))))
 
         for _, line in constructor_knl.preambles:
             if "gfsu" not in line:
@@ -162,12 +180,114 @@ def generate_standalone_code(sf, filename):
                       "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                       ])
 
+
+def generate_standalone_code_google_benchmark(sf, filename):
+    delete_cache_items("kernel_default")
+
+    # Turn off opcounting
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
+    # Extract sum factorization kernel
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+
+    # Add the implementation of the kernel.
+    # TODO: This can probably be done in a safer way?
+    first_line = knl.member.lines[0]
+    arguments = first_line[first_line.find("(") + 1:first_line.find(")")]
+
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(first_line),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include \"benchmark/benchmark.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+
+    write_global_data(sf, filename)
+
+    with open(filename, "a") as f:
+        arguments = ', '.join(sf.interface.signature_args)
+        if len(arguments) > 0:
+            arguments = ', ' + arguments
+        arguments = 'const char* buffer0, const char* buffer1' + arguments
+        f.write("void sumfact_kernel({})\n".format(arguments))
+        for line in knl.member.lines[1:]:
+            f.write("{}\n".format(line))
+
+        f.write("\n\n")
+        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
+
+    write_setup_code(sf, filename, define_thetas=False)
+
+    additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
+    additional_arguments = ', '.join(additional_arguments)
+    if len(additional_arguments) > 0:
+        additional_arguments = ', ' + additional_arguments
+    with open(filename, "a") as f:
+        f.writelines(["  for (auto _ : state){\n",
+                      "    sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments),
+                      "  }\n",
+                      "}\n",
+                      "BENCHMARK(BM_sumfact_kernel);\n",
+                      "\n",
+                      "BENCHMARK_MAIN();"
+                      ])
+
+    # Maybe turn opcounting on again
+    set_option("opcounter", opcounting)
+
+
+def generate_standalone_code(sf, filename):
+    delete_cache_items("kernel_default")
+
+    # Turn off opcounting
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
+    # Extract sum factorization kernel
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+    first_line = knl.member.lines[0]
+
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(first_line),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/tsc.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+
+        f.writelines(["int main(int argc, char** argv)\n",
+                      "{\n",
+                      ])
+
+    write_setup_code(sf, filename)
+
+    # Write measurement
+    with open(filename, "a") as f:
         # Start a TSC timer
         f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
                       ])
 
         # Add the implementation of the kernel.
-        f.write("  for(int i=0; i<{}; ++i)\n".format(int(1e9 / sf.operations)))
+        repeats = int(1e9 / sf.operations)
+        f.write("  for(int i=0; i<{}; ++i)\n".format(repeats))
         f.write("  {\n")
         for line in knl.member.lines[1:]:
             f.write("    {}\n".format(line))
@@ -177,22 +297,218 @@ def generate_standalone_code(sf, filename):
         f.writelines(["  auto stop = Dune::PDELab::TSC::stop();\n",
                       "  std::ofstream file;\n",
                       "  file.open(argv[1]);\n",
-                      "  file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
+                      "  file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
                       "  file.close();\n",
                       "  accum += output[dis(rng)];\n",
                       "  std::cout << accum;\n",
                       "}\n",
                       ])
-        set_option("opcounter", opcounting)
 
+    # Maybe turn opcounting on again
+    set_option("opcounter", opcounting)
+
+
+def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
+    # Turn off opcounting
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
+    # Remove opcounter from signature
+    p = re.compile('OpCounter::OpCounter<([^>]*)>')
+    assert len(signature) == 1
+    sig = signature[0]
+    sig = p.sub(r'\1', sig)
+    assert 'OpCounter' not in sig
+
+    # Which transformations were applied
+    codegen_transformations = ''
+    if transformations:
+        codegen_transformations = ''
+        for trafo in transformations:
+            codegen_transformations += '// {}\n'.format(trafo)
+
+    template = 'kernel_benchmark_template1.cc.in'
+    use_datasets = True
+
+    # Old benchmark template
+    # template = 'kernel_benchmark_template0.cc.in'
+    # use_datasets = False
+
+    template_filename = pkg_resources.resource_filename(__name__, template)
+    with open(template_filename, 'r') as f:
+        benchmark = f.read()
+
+    # Find function arguments and global arguments
+    arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
+    arguments = [a.split(' ')[-1] for a in arguments]
+    global_args = [a for a in kernel.args if a.name not in arguments]
+    buffer_arguments = [a for a in arguments if a.startswith('buff')]
+    input_arguments = [a for a in arguments if a not in buffer_arguments]
+
+    # Declare global arguments
+    codegen_declare_global_arguments = ''
+    target = DuneTarget()
+    for g in global_args:
+        decl_info = g.decl_info(target, True, g.dtype)
+        for idi in decl_info:
+            ast_builder = target.get_device_ast_builder()
+            arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
+            arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
+            arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
+            codegen_declare_global_arguments += '{}\n'.format(arg_decl)
+    codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, '  ')
+
+    # Helper function for argument initialization
+    def _initialize_arg(arg):
+        if isinstance(arg, lp.ValueArg):
+            return []
+        real = type_floatingpoint()
+        size = reduce(mul, arg.shape)
+        fill_name = arg.name + '_fill'
+        lines = ['  {0}* {1} = ({0} *) {2};'.format(real, fill_name, arg.name),
+                 '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
+                 '    {}[i] = unif(re);'.format(fill_name),
+                 '  }']
+        return lines
+
+    # Initialize global arguments
+    codegen_initialize_global_arguments = ''
+    for arg in global_args:
+        lines = _initialize_arg(arg)
+        codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, '  ')
+
+    codegen_initialize_input = ''
+
+    # Function we want to benchmark
+    codegen_benchmark_function = ''
+    codegen_benchmark_function += sig[0:sig.find(')') + 1]
+    codegen_benchmark_function += lp.generate_body(kernel)
+    codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, '  ')
+
+    # Declare function arguments
+    codegen_declare_arguments = []
+    codegen_declare_input = []
+    function_arguments = [a for a in kernel.args if a.name in arguments]
+    for arg in function_arguments:
+        if 'buffer' in arg.name:
+            byte_size = reduce(mul, arg.shape) * 8
+            codegen_declare_arguments.append('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
+                                                                                                      byte_size,
+                                                                                                      arg.alignment),)
+        elif isinstance(arg, lp.ValueArg):
+            assert 'jacobian_offset' in arg.name
+            decl = arg.get_arg_decl(ast_builder)
+            decl = Initializer(decl, 'unif_int(re)')
+            codegen_declare_arguments.append(('  {}\n'.format(decl)))
+        else:
+            assert 'fastdg' in arg.name
+            size = reduce(mul, arg.shape)
+            min_stride = min([tag.stride for tag in arg.dim_tags])
+            size *= min_stride
+            alignment = arg.dtype.itemsize
+            real = type_floatingpoint()
+            if use_datasets:
+                codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                             arg.name,
+                                                                                                             size,
+                                                                                                             alignment)))
+            else:
+                codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                   arg.name,
+                                                                                                   size,
+                                                                                                   alignment)))
+
+    codegen_declare_arguments = ''.join(codegen_declare_arguments)
+    codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, '  ')
+    codegen_declare_input = ''.join(codegen_declare_input)
+    codegen_declare_input = textwrap.indent(codegen_declare_input, '  ')
+
+    # Initialize function arguments
+    codegen_initialize_arguments = ''
+    codegen_initialize_input = ''
+    for arg in function_arguments:
+        if 'fastdg' in arg.name:
+            if use_datasets:
+                lines = _initialize_arg(arg)
+                lines = ['  ' + a for a in lines]
+                lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
+                lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
+                lines.append('}')
+                codegen_initialize_input += '\n'.join(lines) + '\n'
+            else:
+                lines = _initialize_arg(arg)
+                codegen_initialize_arguments += '\n'.join(lines) + '\n'
+        else:
+            lines = _initialize_arg(arg)
+            codegen_initialize_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, '  ')
+    codegen_initialize_input = textwrap.indent(codegen_initialize_input, '  ')
+
+    # Call the benchmark function
+    if use_datasets:
+        arguments_with_datasets = arguments.copy()
+        arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
+        codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
+        codegen_call_benchmark_function += '  ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
+        for arg in input_arguments:
+            codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
+        codegen_call_benchmark_function += '}'
+    else:
+        codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
+    codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, '    ')
+
+    # Replace placeholders in benchmark template
+    benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
+    benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
+    benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
+
+    # Write benchmark source file
+    with open(filename, 'w') as f:
+        f.writelines(benchmark)
+
+    # Maybe turn opcounting on again
+    set_option("opcounter", opcounting)
+
+
+def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
+    """Generate a microbenchmark, compile and run it, and return the time
+
+    Parameters
+    ----------
+    sf: SumfactKernel or VectorizedSumfactKernel
+    kernel: loopy.kernel.LoopKernel
+    signature: str
+    transformation: list of str
+        Will be used to distinguish between autotune targets
+    """
+    if sf is None:
+        assert kernel is not None
+        assert signature is not None
+    else:
+        assert kernel is None
+        assert signature is None
+
+    logger = logging.getLogger(__name__)
 
-def autotune_realization(sf):
     # Make sure that the benchmark directory exists
     dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
     if not os.path.exists(dir):
         os.mkdir(dir)
 
-    basename = "autotune_sumfact_{}".format(sf.function_name)
+    if sf is None:
+        basename = "autotune_sumfact_{}".format(kernel.name)
+    else:
+        basename = "autotune_sumfact_{}".format(sf.function_name)
+    if transformations:
+        for trafo in transformations:
+            basename = '{}_{}'.format(basename, trafo)
     basename = hashlib.sha256(basename.encode()).hexdigest()
 
     filename = os.path.join(dir, "{}.cc".format(basename))
@@ -201,28 +517,75 @@ def autotune_realization(sf):
     executable = os.path.join(dir, basename)
 
     # Generate and compile a benchmark program
+    #
+    # Note: cache restoring is only necessary when generating from SumfactKernel
     with cache_restoring():
         with filelock.FileLock(lock):
             if not os.path.isfile(logname):
-                generate_standalone_code(sf, filename)
+                logger.debug('Generate autotune target in file {}'.format(filename))
 
+                if sf is None:
+                    generate_standalone_kernel_code(kernel, signature, filename, transformations)
+                elif get_option("autotune_google_benchmark"):
+                    generate_standalone_code_google_benchmark(sf, filename)
+                else:
+                    generate_standalone_code(sf, filename)
+
+                call = []
+                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_COMPILATION_WRAPPER")
+                if wrapper:
+                    call.append(wrapper)
+
+                call.extend(compiler_invocation(executable, filename))
                 devnull = open(os.devnull, 'w')
-                ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT)
+                os.environ['DUNE_CODEGEN_THREADS'] = '1'
+                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
-                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename))))
+                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
+
+                # File system synchronization!
+                while not os.path.exists(executable):
+                    time.sleep(0.01)
 
                 # Check whether the user specified an execution wrapper
                 call = []
-                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_WRAPPER")
+                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER")
                 if wrapper:
                     call.append(wrapper)
 
                 # Run the benchmark program
                 call.append(executable)
-                call.append(logname)
+                if get_option("autotune_google_benchmark"):
+                    call.append("--benchmark_out={}".format(logname))
+                    call.append("--benchmark_repetitions=5")
+                    # call.append("--benchmark_out_format=csv")
+                else:
+                    call.append(logname)
                 ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
                     raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
 
+                # File system synchronization!
+                while not os.path.exists(logname):
+                    time.sleep(0.01)
+
             # Extract the result form the log file
-            return float(next(iter(open(logname, "r")))) / 1000000
+            if get_option("autotune_google_benchmark"):
+                import json
+                with open(logname) as json_file:
+                    try:
+                        data = json.load(json_file)
+                        minimal_time = 1e80
+                        for b in data['benchmarks']:
+                            if b['name'].endswith('_mean') or b['name'].endswith('_median') or b['name'].endswith('_stddev'):
+                                pass
+                            else:
+                                if b['cpu_time'] < minimal_time:
+                                    minimal_time = b['cpu_time']
+                        assert minimal_time < 1e80
+                        return minimal_time
+                    except Exception as e:
+                        print("Error while loading file {}".format(logname))
+                        raise e
+            else:
+                return float(next(iter(open(logname, "r")))) / 1000000
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index c9b75eb445af01e9420e850650b12054dd0115f3..c49e052a90faac34ad6f64b539f93b3bb1d560cb 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -235,6 +235,28 @@ class SumfactBasisMixin(GenericBasisMixin):
         return prim.Subscript(var, vsf.quadrature_index(sf, self))
 
 
+@basis_mixin("sumfact_pointdiagonal")
+class SumfactPointDiagonalBasisMixin(SumfactBasisMixin):
+    def lfs_inames(self, element, restriction, number=1):
+        return ()
+
+    def implement_basis(self, element, restriction, number):
+        info = self.current_info[number]
+        if element == info.element and restriction == info.restriction:
+            return 1
+        else:
+            return 0
+
+    def implement_reference_gradient(self, element, restriction, number):
+        index, = self.indices
+        self.indices = None
+        info = self.current_info[number]
+        if element == info.element and restriction == info.restriction and index == info.grad_index:
+            return 1
+        else:
+            return 0
+
+
 class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
     def __init__(self,
                  matrix_sequence=None,
@@ -345,14 +367,15 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Note: Here we do not need to reverse any permutation since this is
         # already done in the setup_input method above!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index d37558f5c40f42355b5fa99ed9a4ea25ab7ade28..8ac8aaa4faa1a4849c3b88fb54f87b0f87ca7b2b 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -456,11 +456,12 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c4b49da5aec59006a9f007534dc7711b98ef0387
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
@@ -0,0 +1,37 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+#include <iostream>
+#include <fstream>
+#include <random>
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+${CODEGEN_BENCHMARK_FUNCTION}
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  std::uniform_real_distribution<double> unif(0,1);
+  std::uniform_int_distribution<int> unif_int(0,1);
+  std::default_random_engine re;
+
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+  ${CODEGEN_INITIALIZE_INPUT}
+
+  ${CODEGEN_DECLARE_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+  for (auto _ : state){
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+}
+
+${CODEGEN_BENCHMARK}
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c03577d672123b3063020841b731fc1fd9780fce
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
@@ -0,0 +1,58 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+
+#include <iostream>
+#include <fstream>
+#include <random>
+#include <cstdlib>
+
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+const int datasets = 512;
+
+class Benchmark{
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+
+public:
+  Benchmark (){
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+    ${CODEGEN_INITIALIZE_INPUT}
+  }
+
+  ${CODEGEN_BENCHMARK_FUNCTION}
+
+  void run_benchmark(){
+    ${CODEGEN_DECLARE_ARGUMENTS}
+
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+
+};
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  Benchmark benchmark;
+  for (auto _ : state){
+    benchmark.run_benchmark();
+  }
+}
+
+BENCHMARK(BM_sumfact_kernel) -> Threads(atoi(std::getenv("DUNE_CODEGEN_THREADS"))) -> UseRealTime();
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index 14cd0b1d0fe3814f294c22bc00f25260f4ee0233..461f5b0a3ace21e09a7a3707c4849aa1c36e6972 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -30,7 +30,6 @@ from dune.codegen.sumfact.quadrature import quadrature_points_per_direction
 from dune.codegen.sumfact.symbolic import (SumfactKernel,
                                            VectorizedSumfactKernel,
                                            )
-from dune.codegen.sumfact.vectorization import attach_vectorization_info
 from dune.codegen.sumfact.accumulation import sumfact_iname
 from dune.codegen.loopy.target import dtype_floatingpoint
 from dune.codegen.loopy.vcl import ExplicitVCLCast
@@ -61,6 +60,12 @@ def name_buffer_storage(buff, which):
     return name
 
 
+def _max_sum_factorization_buffer_size(sf):
+    size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
+               product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+    return size
+
+
 @kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
@@ -74,8 +79,7 @@ def _realize_sum_factorization_kernel(sf):
     for buf in buffers:
         # Determine the necessary size of the buffer. We assume that we do not
         # underintegrate the form!!!
-        size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
-                   product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+        size = _max_sum_factorization_buffer_size(sf)
         temporary_variable("{}_dummy".format(buf),
                            shape=(size,),
                            custom_base_storage=buf,
@@ -118,10 +122,32 @@ class BufferSwitcher(object):
     def __init__(self):
         self.current = 0
 
-    def get_temporary(self, name=None, **kwargs):
+    def get_temporary(self, sf=None, name=None, **kwargs):
+        assert sf
         assert name
+
         bs = "buffer{}".format(self.current)
-        globalarg(bs)
+        shape = kwargs['shape']
+        assert shape
+        dim_tags = kwargs['dim_tags'].split(',')
+        assert dim_tags
+
+        # Calculate correct alignment
+        vec_size = 1
+        if 'vec' in dim_tags:
+            vec_size = shape[dim_tags.index('vec')]
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = np.dtype(kwargs.get("dtype", dtype_floatingpoint()))
+        alignment = dtype.itemsize * vec_size
+
+        # Add this buffer as global argument to the kernel. Add a shape to make
+        # benchmark generation for autotuning of loopy kernels easier. Since
+        # this buffer is used to store different data sizes we need to make
+        # sure it is big enough.
+        assert sf
+        size = _max_sum_factorization_buffer_size(sf)
+        globalarg(bs, shape=(size,), alignment=alignment, dim_tags=['f', ])
+
         temporary_variable(name,
                            managed=True,
                            custom_base_storage=bs,
@@ -193,7 +219,8 @@ def realize_sumfact_kernel_function(sf):
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             input_summand = sf.interface.realize_direct_input(input_inames, inp_shape)
         elif l == 0:
-            input_summand = sf.interface.realize_input(input_inames,
+            input_summand = sf.interface.realize_input(sf,
+                                                       input_inames,
                                                        inp_shape,
                                                        vec_iname,
                                                        vec_shape,
@@ -204,7 +231,8 @@ def realize_sumfact_kernel_function(sf):
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
-            inp = buffer.get_temporary("buff_step{}_in".format(l),
+            inp = buffer.get_temporary(sf,
+                                       "buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -250,7 +278,8 @@ def realize_sumfact_kernel_function(sf):
                 output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation)
                 output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation)
 
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -265,7 +294,8 @@ def realize_sumfact_kernel_function(sf):
 
         else:
             output_shape = tuple(out_shape[1:]) + (out_shape[0],)
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 60641b0ce4af4ddef525a4c03a438c4ebe277c0c..dfd9383f93c79d3c50bfcca24cdd717fcc16aa58 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -60,7 +60,7 @@ class SumfactKernelInterfaceBase(object):
         """
         raise NotImplementedError
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         """Interpret the input of sumfact kernel function in the right way (non fastdgg)
 
         This happens inside the sumfact kernel function.
@@ -73,6 +73,7 @@ class SumfactKernelInterfaceBase(object):
 
         Parameters
         ----------
+        sf : SumfactKernel or VectorizedSumfactKernel
         inames : tuple of pymbolic.primitives.Variable
             Inames for accesing the input. Ordered according to permuted matrix sequence.
         shape : tuple of int
@@ -252,11 +253,12 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return prim.Call(prim.Variable(hadd_function), (result,))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The input for stage 3 is quadrature permuted. The inames and shape
         # passed to this method are quadrature and cost permuted. This means we
         # need to take the cost permutation back to get the right inames and
@@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         # Get a temporary that interprets the base storage of the input as a
         # column-major matrix.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -382,9 +385,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def realize_direct_output(self, result, inames, shape, **args):
         outputs = set(self.interfaces)
 
-        # If multiple horizontal_add's are to be performed with 'result'
-        # we need to precompute the result!
         if len(outputs) > 1:
+            # Introduce substrule for the argument of the horizontal add
             substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
             subst_rule(substname, (), result)
             result = prim.Call(prim.Variable(substname), ())
@@ -503,6 +505,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         the transformation brings the next reduction index in the fastest
         position.
 
+        In einsum notation from numpy this can be written as three contractions
+        of the form: 'ij,jkl->kli'
+
         It can make sense to permute the order of directions. If you have
         a small m_l (e.g. stage 1 on faces) it is better to do direction l
         first. This can be done by:
@@ -533,6 +538,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             other.
         interface: An SumfactKernelInterfaceBase instance describing the input
             (stage 1) or output (stage 3) of the kernel
+
         """
         # Assert the inputs!
         assert isinstance(matrix_sequence, tuple)
@@ -828,6 +834,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
                  vertical_width=1,
                  buffer=None,
                  insn_dep=frozenset(),
+                 transformations=(),
                  ):
         # Assert the input data structure
         assert isinstance(kernels, tuple)
@@ -880,8 +887,9 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     #
     @property
     def function_name(self):
-        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
+        name = "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
                                     self.interface.function_name_suffix)
+        return name
 
     @property
     def cache_key(self):
diff --git a/python/dune/codegen/sumfact/tabulation.py b/python/dune/codegen/sumfact/tabulation.py
index 9def97eb3ba4fdae280cc65e7f12ca73164d1146..e2e4a771ec9918f960e5ea88821aa7acb885a0c2 100644
--- a/python/dune/codegen/sumfact/tabulation.py
+++ b/python/dune/codegen/sumfact/tabulation.py
@@ -50,6 +50,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
                  direction=None,
                  slice_size=None,
                  slice_index=None,
+                 additional_tabulation=None,
                  ):
         """
         Arguments:
@@ -61,6 +62,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         direction: Direction corresponding to this matrix
         slice_size: Number of slices for this direction
         slice_index: To which slice does this belong
+        additional_tabulation: A factor to be multiplied with this basis tabulation matrix.
         """
         assert(isinstance(basis_size, int))
         ImmutableRecord.__init__(self,
@@ -71,6 +73,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
                                  direction=direction,
                                  slice_size=slice_size,
                                  slice_index=slice_index,
+                                 additional_tabulation=additional_tabulation,
                                  )
 
     @property
@@ -90,6 +93,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         if self.slice_size is not None:
             infos.append("s{}".format(self.slice_index))
 
+        if self.additional_tabulation is not None:
+            infos.append("prod{}".format(self.additional_tabulation._shortname))
+
         return "".join(infos)
 
     def __str__(self):
@@ -122,7 +128,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
     def pymbolic(self, indices):
         name = str(self)
         define_theta(name, self)
-        return prim.Subscript(prim.Variable(name), indices)
+        ret = prim.Subscript(prim.Variable(name), indices)
+
+        return ret
 
     @property
     def vectorized(self):
@@ -461,22 +469,31 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
     else:
         args.append(tabmat.face)
 
+    # Get right hand side of basis evaluation matrix assignment
+    expr = prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args))
+
+    # Maybe multiply another matrix (needed for the very special case of assembling point diagonals)
+    if tabmat.additional_tabulation is not None:
+        expr = prim.Product((expr, prim.Call(PolynomialLookup(polynomials, tabmat.additional_tabulation.derivative), tuple(args))))
+
     instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices),
-                expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)),
+                expression=expr,
                 kernel="operator",
                 depends_on=dep,
                 )
 
 
-def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None):
+def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None, additional_sequence=None):
     dim = world_dimension()
     result = [None] * dim
+    if additional_sequence is None:
+        additional_sequence = [None] * dim
     quadrature_size = quadrature_points_per_direction()
     assert (basis_size is not None)
     if facedir is not None:
         quadrature_size = quadrature_size[:facedir] + (1,) + quadrature_size[facedir:]
 
-    for i in range(dim):
+    for i, add_seq in zip(range(dim), additional_sequence):
         onface = None
         if facedir == i:
             onface = facemod
@@ -485,6 +502,7 @@ def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=No
                                           basis_size=basis_size[i],
                                           transpose=transpose,
                                           derivative=derivative == i,
-                                          face=onface)
+                                          face=onface,
+                                          additional_tabulation=add_seq)
 
     return tuple(result)
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
new file mode 100644
index 0000000000000000000000000000000000000000..a752d217add1013bbcb22771824cf992103b2237
--- /dev/null
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -0,0 +1,422 @@
+import re
+import logging
+
+import loopy as lp
+import pymbolic.primitives as prim
+import islpy as isl
+
+from dune.codegen.generation import (get_counted_variable,
+                                     get_global_context_value,
+                                     )
+from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions, remove_reduction
+from dune.codegen.options import get_form_option, get_option
+from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.error import CodegenAutotuneError
+from dune.codegen.sumfact.autotune import autotune_realization
+
+
+def _current_iname_order(current_inames, new_iname_order):
+    """Sort the inames for this contraction according to new_iname order"""
+    current_iname_order = []
+    for i in new_iname_order:
+        for j in current_inames:
+            if i in j:
+                current_iname_order.append(j)
+    return current_iname_order
+
+
+def _get_inames_of_reduction(instr, iname_permutation):
+    # Check which loops are inside the reduction loop
+    reduction_index = iname_permutation[-1]
+    within_reduction = tuple(i > reduction_index for i in iname_permutation)
+
+    # Basename of inner inames
+    dim = world_dimension()
+    inner_inames_base = []
+    for index, within in enumerate(within_reduction):
+        if within:
+            inner_inames_base.append('sf_out_inames_{}'.format(dim - 1 - index))
+
+    # Name of inner inames and outer inames
+    inner_inames = []
+    outer_inames = []
+    vec_inames = []
+    for i in [iname for iname in instr.within_inames]:
+        for j in inner_inames_base:
+            if j in i:
+                inner_inames.append(i)
+        if 'sf_vec' in i:
+            vec_inames.append(i)
+        elif i not in inner_inames:
+            outer_inames.append(i)
+
+    # Reduction iname
+    regex = re.compile('sf_red_([0-9]*)')
+    reduction_index = set(regex.findall(str(instr)))
+    if len(reduction_index) == 0:
+        reduction_iname = None
+    else:
+        assert len(reduction_index) == 1
+        reduction_index = reduction_index.pop()
+        reduction_iname = 'sf_red_{}'.format(reduction_index)
+
+    return outer_inames, reduction_iname, inner_inames, vec_inames
+
+
+class FindReductionMapper(lp.symbolic.WalkMapper):
+    def __init__(self):
+        self.reductions = []
+
+    def map_reduction(self, expr, **kwargs):
+        self.reductions.append(expr)
+        return lp.symbolic.WalkMapper.map_reduction(self, expr, **kwargs)
+
+
+def _reorder_reduction_loops(kernel, match, iname_order):
+    """Reorder loops in instructions containing one reduction
+
+    In order to reorder the loops we need to manually do the accumulation in a
+    temporary that is large enough to fit all the data created in loops inside
+    the reduction loop.
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    match: loopy match expression selecting the instructions to transform
+    iname_order: tuple of str
+        preferred loop order for this instruction
+    """
+    instructions = lp.find_instructions(kernel, match)
+    for instr in instructions:
+        # Get reduction
+        reduction = FindReductionMapper()
+        reduction(instr.expression)
+        assert len(reduction.reductions) == 1
+        reduction = reduction.reductions[0]
+
+        # Reduction iname, inner inames and vector inames
+        assert len(reduction.inames) == 1
+        reduction_iname = reduction.inames[0]
+        assert set([reduction_iname]) == instr.reduction_inames()
+        if reduction_iname not in iname_order:
+            lp.prioritize_loops(kernel, iname_order)
+            continue
+        inner_inames = iname_order[iname_order.index(reduction_iname) + 1:]
+        if len(inner_inames) == 0:
+            lp.prioritize_loops(kernel, iname_order)
+            continue
+        # TODO: There should be a better way to do that
+        vec_inames = [i for i in instr.within_inames if 'sf_vec' in i]
+        assert len(vec_inames) < 2
+
+        # {{{ Create new temporary variable
+
+        # Create dim_tags
+        dim_tags = ','.join(['f'] * len(inner_inames))
+        vectorized = len(vec_inames) > 0
+        if vectorized:
+            assert len(vec_inames) == 1
+            dim_tags = dim_tags + ',vec'
+
+        # Create shape
+        shape = tuple(kernel.get_constant_iname_length(i) for i in inner_inames + vec_inames)
+
+        # Update temporary_variables of this kernel
+        from dune.codegen.loopy.temporary import DuneTemporaryVariable
+        accum_variable = get_counted_variable('accum_variable')
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = lp.types.NumpyType(dtype_floatingpoint())
+        var = {accum_variable: DuneTemporaryVariable(accum_variable,
+                                                     dtype=dtype,
+                                                     shape=shape,
+                                                     dim_tags=dim_tags,
+                                                     managed=True)}
+        tv = kernel.temporary_variables.copy()
+        tv.update(var)
+        kernel = kernel.copy(temporary_variables=tv)
+
+        # }}}
+
+        # Set accumulation variable to zero
+        accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
+        if vectorized:
+            accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        accum_init_id = instr.id + '_accum_init'
+        accum_init_instr = lp.Assignment(assignee,
+                                         0,
+                                         within_inames=instr.within_inames,
+                                         id=accum_init_id,
+                                         depends_on=instr.depends_on,
+                                         tags=('accum_init',),
+                                         )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
+
+        # Accumulate in temporary variable
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        expression = prim.Sum((assignee, reduction.expr))
+        within_inames = frozenset(tuple(instr.within_inames) + reduction.inames)
+        accum_id = instr.id + '_accum'
+        accum_instr = lp.Assignment(assignee,
+                                    expression,
+                                    within_inames=within_inames,
+                                    id=accum_id,
+                                    depends_on=frozenset([accum_init_id]),
+                                    tags=('accum',),
+                                    )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
+
+        # Replace reduction in insn with accumulation result
+        def map_reduction(expr, rec):
+            return assignee
+        mapper = lp.symbolic.ReductionCallbackMapper(map_reduction)
+        new_expression = mapper(instr.expression)
+        assign_id = instr.id + '_assign'
+        new_instr = instr.copy(expression=new_expression,
+                               id=assign_id,
+                               depends_on=frozenset(list(instr.depends_on) + [accum_id]))
+        kernel = kernel.copy(instructions=kernel.instructions + [new_instr])
+
+        # Fix dependencies and remove old instruction
+        for i in kernel.instructions:
+            if instr.id in i.depends_on:
+                match = lp.match.Id(i.id)
+                kernel = lp.add_dependency(kernel, match, assign_id)
+        kernel = lp.remove_instructions(kernel, set([instr.id]))
+
+        # Duplicate and reorder loops for accumulation
+        ids = [accum_init_id, accum_id]
+        duplicate_inames = tuple(inner_inames)
+        for idx in ids:
+            match = lp.match.Id(idx)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            current_iname_order = _current_iname_order(match_inames, iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+
+        # Reorder assignment loops
+        kernel = lp.prioritize_loops(kernel, iname_order)
+        return kernel
+
+
+def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction accumulating directly in the data structure"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames,
+                                                   new_iname_order)
+
+        # We can directly use lp.prioritize_loops if
+        # - The reduction is the innermost loop
+        # - There is no reduction (eg reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            # If the instruction is a reduction we can directly accumulate in the assignee
+            assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
+            match = lp.match.Id(instr.id)
+            kernel = remove_reduction(kernel, match)
+
+            lp.prioritize_loops(kernel, current_iname_order)
+            duplicate_inames = tuple(inner_inames)
+            match = lp.match.Id(instr.id + '_set_zero')
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
+            lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
+        else:
+            # In stage 3 this is usually not a reduction. In this case direct
+            # accumulation is not possible and we accumulate in a temporay
+            # variable instead
+            match = lp.match.Id(instr.id)
+            kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction using an accumulation variable"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames, new_iname_order)
+
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (eg reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        # Otherwise introduce a temporary accumulation variable for this
+        # instruction and reorder the accumulation loops around it
+        match = lp.match.Id(instr.id)
+        kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variable=True):
+    """Change loop order in tensor contractions of sum factorization kernel
+
+    Since there is a reduction involved this implies more than just reordering
+    loops. There are two possibilities to handle the reduction:
+
+    1) Generate an object of correct size for the accumulation and assign to
+    the result data structure afterwards
+
+    2) Make sure to set the correct entries of the result data structure to
+    zero and accumulate inplace
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    iname_permutation: tuple of ints
+        How to permute the loop inames. Should contain some permutation of the
+        numbers 0 to dim+1
+    accum_variable: bool
+        If True use method 1 else use method 2 from above
+
+    Notes
+    -----
+    Using einsum notation from numpy a contraction of a sum factorization
+    kernel can be written as:
+
+    - 3D: 'ij,jkl->kli'
+    - 2D: 'ij,jk->ki'
+
+    In the sum factorization kernel itself those inames are called:
+
+    sf_out_inames_2_* : l
+    sf_out_inames_1_* : k
+    sf_out_inames_0_* : i
+    sf_red_* : j
+
+    where * represents the current direction (0,1,2 for 3D problems). The
+    default order of the loops is
+
+    (sf_out_inames_2_*, sf_out_inames_1_*, sf_out_inames_0_*, sf_red_*)
+
+    A permutation vector of (3,2,0,1) would mean that we do
+
+    (sf_out_inames_0_*, sf_red_*, sf_out_inames_1_*, sf_out_inames_2_*)
+
+    instead.
+
+    """
+    if accum_variable:
+        kernel = _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
+        return kernel
+    else:
+        kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
+        return kernel
+
+
+def tensor_contraction_loop_order_generator(kernel):
+    yield kernel, ['None']
+
+    dim = world_dimension()
+    identity = range(dim + 1)
+    import itertools
+    for permutation in itertools.permutations(identity):
+        # Culling of stupid variant
+        if permutation[0] == dim:
+            continue
+
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=True)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(permutation)]
+
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=False)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(permutation)]
+
+
+def simple_autotuner(kernel_generator, signature):
+    kernel, transformations = next(kernel_generator)
+    best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
+    best_kernel = kernel
+    best_transformations = transformations
+    for kernel, transformations in kernel_generator:
+        cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
+        if cost < best_cost:
+            best_cost = cost
+            best_kernel = kernel
+            best_transformations = transformations
+    return best_kernel, best_transformations
+
+
+def autotune_tensor_contraction_loop_order(kernel, signature):
+    logger = logging.getLogger(__name__)
+
+    assert get_option('autotune_google_benchmark')
+    from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add
+    kernel = match_fused_multiply_add(kernel)
+    generator = tensor_contraction_loop_order_generator(kernel)
+    kernel, transformations = simple_autotuner(generator, signature)
+    logger.debug('autotute_tensor_contraction_loop_order - kernel {} transformations {}'.format(kernel.name, transformations))
+    return kernel
+
+
+def sumfact_performance_transformations(kernel, signature):
+    if get_form_option('sumfact_performance_transformations'):
+        assert not get_form_option('sumfact_performance_transformations_testrun')
+        if kernel.name.startswith('sfimpl'):
+            kernel = autotune_tensor_contraction_loop_order(kernel, signature)
+
+    # Testing performance transformations
+    elif get_form_option('sumfact_performance_transformations_testrun'):
+        assert not get_form_option('sumfact_performance_transformations')
+        if kernel.name.startswith('sfimpl'):
+            dim = world_dimension()
+            testcase = get_form_option('sumfact_performance_transformations_testrun')
+            if dim == 2:
+                testrun_dict = {
+                    1: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), True))],
+                    2: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), False))],
+                    3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), True))],
+                    4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), False))],
+                }
+                for trafo, arguments in testrun_dict[testcase]:
+                    kernel = trafo(kernel, *arguments)
+            else:
+                testrun_dict = {
+                    1: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), True))],
+                    2: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), False))],
+                    3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), True))],
+                    4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), False))],
+                }
+                for trafo, arguments in testrun_dict[testcase]:
+                    kernel = trafo(kernel, *arguments)
+    return kernel
diff --git a/python/dune/codegen/ufl/finiteelement.py b/python/dune/codegen/ufl/finiteelement.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae0ef35f2a3b30cd2837525447233dfe017cd76d
--- /dev/null
+++ b/python/dune/codegen/ufl/finiteelement.py
@@ -0,0 +1,43 @@
+""" Registering finite elements that we have, but FEniCS does not """
+
+from ufl import register_element, L2
+
+
+# NOTE(review): positional arguments to register_element are presumably
+# (family, short name, value rank, sobolev space, mapping, (min, max) degree,
+# admissible cell names) -- confirm against ufl's elementlist documentation.
+register_element("Discontinuous Gauss-Lobatto-Legendre",
+                 "DGLL",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("Monomials",
+                 "Monom",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+register_element("L2-Orthonormal Polynomials",
+                 "OPB",
+                 0,
+                 L2,
+                 "identity",
+                 (0, None),
+                 ('interval',),
+                 )
+
+
+# Rannacher-Turek is only registered for degree 1 on quad/hex cells
+register_element("Rannacher-Turek",
+                 "RaTu",
+                 0,
+                 L2,
+                 "identity",
+                 (1, 1),
+                 ('quadrilateral', 'hexahedron'),
+                 )
diff --git a/python/dune/codegen/ufl/transformations/blockpreconditioner.py b/python/dune/codegen/ufl/transformations/blockpreconditioner.py
index c992af12e0cae3499f6f6694027610dfd53c535e..b3fd11249e766bdc8abec4c0d01defbf5de9e59c 100644
--- a/python/dune/codegen/ufl/transformations/blockpreconditioner.py
+++ b/python/dune/codegen/ufl/transformations/blockpreconditioner.py
@@ -21,7 +21,7 @@ class OffDiagonalBlockSwitcher(MultiFunction):
     def positive_restricted(self, o):
         self.res = Restriction.POSITIVE
         ret = self(o.ufl_operands[0])
-        self.rest = Restriction.NONE
+        self.res = Restriction.NONE
         if isinstance(ret, uc.Zero):
             return ret
         else:
@@ -55,6 +55,8 @@ class OffDiagonalBlockSwitcher(MultiFunction):
 def list_restriction_tuples(diagonal):
     if diagonal:
         yield (Restriction.NONE, Restriction.NONE)
+        yield (Restriction.POSITIVE, Restriction.POSITIVE)
+        return
 
     res = (Restriction.POSITIVE, Restriction.NEGATIVE)
     amount = 1 if diagonal else 2
diff --git a/python/dune/codegen/ufl/visitor.py b/python/dune/codegen/ufl/visitor.py
index ab7c334f323bbc51221d926d357466d6ebc8cf88..e774e03d90e8bc61a1108767867ab046bfa56c81 100644
--- a/python/dune/codegen/ufl/visitor.py
+++ b/python/dune/codegen/ufl/visitor.py
@@ -37,6 +37,7 @@ from ufl.classes import (Coefficient,
                          JacobianDeterminant,
                          )
 
+from pytools import product as ptproduct
 import pymbolic.primitives as prim
 import numpy as np
 
@@ -278,7 +279,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     #
 
     def product(self, o):
-        return prim.flattened_product(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
+        if all(isinstance(op, (int, float)) for op in ops):
+            return ptproduct(ops)
+        return prim.flattened_product(ops)
 
     def float_value(self, o):
         return o.value()
@@ -290,7 +294,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         return prim.quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1]))
 
     def sum(self, o):
-        return prim.flattened_sum(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
+        if all(isinstance(op, (int, float)) for op in ops):
+            return sum(ops)
+        return prim.flattened_sum(ops)
 
     def zero(self, o):
         # UFL has Zeroes with shape. We ignore those indices.
diff --git a/python/setup.py b/python/setup.py
index 88431ad1b496afc1c41d13fae5608956274f3018..d847cb90064e734efe22ea0646129fa8bb86be35 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -47,5 +47,6 @@ setup(name='dune.codegen',
         "console_scripts": [
             "generate_operators = dune.codegen.compile:entry_generate_operators",
             "generate_driver = dune.codegen.compile:entry_generate_driver",
+            "show_options = dune.codegen.options:show_options",
         ]
     })
diff --git a/test/blockstructured/poisson/CMakeLists.txt b/test/blockstructured/poisson/CMakeLists.txt
index 1b48ca5f343efeffe528a5066d3bc4073898b8a5..aa711fedea7e867da04f96050bb817c7935ef83f 100644
--- a/test/blockstructured/poisson/CMakeLists.txt
+++ b/test/blockstructured/poisson/CMakeLists.txt
@@ -41,4 +41,9 @@ dune_add_formcompiler_system_test(UFLFILE poisson.ufl
 dune_add_formcompiler_system_test(UFLFILE poisson.ufl
         BASENAME blockstructured_poisson_grid
         INIFILE poisson_grid.mini
-)
\ No newline at end of file
+)
+
+dune_add_formcompiler_system_test(UFLFILE poisson.ufl
+        BASENAME blockstructured_poisson_vec_tail
+        INIFILE poisson_vec_tail.mini
+        )
diff --git a/test/blockstructured/poisson/poisson_tensor.mini b/test/blockstructured/poisson/poisson_tensor.mini
index 6f18a924c4deabebaeb8447abd77076b0e2e127f..1e6e90fa76bdf3c00be899a879921845cbdf73bb 100644
--- a/test/blockstructured/poisson/poisson_tensor.mini
+++ b/test/blockstructured/poisson/poisson_tensor.mini
@@ -1,8 +1,13 @@
 __name = blockstructured_poisson_tensor_{__exec_suffix}
-__exec_suffix = {grid_suffix}_{vec_suffix}_{dim_suffix}
+__exec_suffix = {grid_suffix}_{vec_suffix}_{dim_suffix}_blocks_{blocks}
 
 dim = 2, 3 | expand dimension
 
+blocks_2d = 8, 7 | expand blocks
+blocks_3d = 4, 5 | expand blocks
+
+blocks = {blocks_2d}, {blocks_3d} | expand dimension
+
 grid_suffix = structured, unstructured | expand unstructured
 vec_suffix = nonvec, vec | expand vectorized
 dim_suffix = 2d, 3d | expand dimension
@@ -26,7 +31,7 @@ matrix_free = 1
 vectorization_blockstructured = 0, 1 | expand vectorized
 generate_jacobians = 0
 blockstructured = 1
-number_of_blocks = 8, 4 | expand dimension
+number_of_blocks = {blocks}
 geometry_mixins = blockstructured_equidistant, blockstructured_multilinear | expand unstructured
 
 [formcompiler.ufl_variants]
diff --git a/test/blockstructured/poisson/poisson_vec_tail.mini b/test/blockstructured/poisson/poisson_vec_tail.mini
new file mode 100644
index 0000000000000000000000000000000000000000..9582d752ca3a6b26ea2e9dab28b8754a3f244953
--- /dev/null
+++ b/test/blockstructured/poisson/poisson_vec_tail.mini
@@ -0,0 +1,34 @@
+__name = blockstructured_poisson_vec_tail_{__exec_suffix}
+__exec_suffix = {dimname}_{tail_suffix}
+
+dim = 2, 3 | expand dimension
+dimname = 2d, 3d | expand dimension
+
+cells = 8, 2 | expand dimension | repeat {dim}
+extension = 1. | repeat {dim}
+
+tail_vec = 0, 1 | expand tail_vec
+tail_modus = consecutive, blocked | expand mod
+tail_suffix = novec_{tail_modus}, vec_{tail_modus} | expand tail_vec
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+matrix_free = 1
+generate_jacobians = 0
+blockstructured = 1
+number_of_blocks = 15, 7 | expand dimension
+vectorization_blockstructured = 1
+vectorization_blockstructured_tail = {tail_vec}
+vectorization_blockstructured_tail_ordering = {tail_modus}
+geometry_mixins = blockstructured_equidistant
+
+[formcompiler.ufl_variants]
+cell = quadrilateral, hexahedron | expand dimension
+degree = 1
\ No newline at end of file
diff --git a/test/sumfact/CMakeLists.txt b/test/sumfact/CMakeLists.txt
index 187aee212ed125a99bc82670fe34e680379542f7..61b81b76082cfa3c83b04d2632194d5a4a6e6447 100644
--- a/test/sumfact/CMakeLists.txt
+++ b/test/sumfact/CMakeLists.txt
@@ -2,3 +2,4 @@ add_subdirectory(hyperbolic)
 add_subdirectory(mass)
 add_subdirectory(poisson)
 add_subdirectory(stokes)
+add_subdirectory(preconditioner)
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 307e6d43e25caa89234a884156c7ee5643f16b8f..c81c94bc9fd536645149940d290846abce857715 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -119,3 +119,39 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
                                   BASENAME sumfact_poisson_dg_3d_diagonal
                                   INIFILE diagonal.mini
                                   )
+
+#======================================
+# Test autotuning with google-benchmark
+#======================================
+if(benchmark_FOUND)
+  dune_add_formcompiler_system_test(UFLFILE poisson_3d.ufl
+                                    BASENAME sumfact_poisson_3d_benchmark
+                                    INIFILE poisson_3d_benchmark.mini
+                                    )
+
+  dune_add_formcompiler_system_test(UFLFILE poisson_dg_volumes_3d.ufl
+                                    BASENAME sumfact_poisson_fastdg_volumes_3d_benchmark
+                                    INIFILE poisson_fastdg_volumes_3d_benchmark.mini
+                                    )
+endif()
+
+
+#================================================
+# Poisson fastdg with performance transformations
+#================================================
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_fastdg_2d_performance_transformations
+                                  INIFILE poisson_fastdg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_fastdg_3d_performance_transformations
+                                  INIFILE poisson_fastdg_3d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_dg_2d_performance_transformations
+                                  INIFILE poisson_dg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_dg_3d_performance_transformations
+                                  INIFILE poisson_dg_3d_performance_transformations.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_3d_benchmark.mini b/test/sumfact/poisson/poisson_3d_benchmark.mini
new file mode 100644
index 0000000000000000000000000000000000000000..aca0d876328991b5b61e848919274631b78d8434
--- /dev/null
+++ b/test/sumfact/poisson/poisson_3d_benchmark.mini
@@ -0,0 +1,29 @@
+__name = sumfact_poisson_3d_benchmark_{__exec_suffix}
+__exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
+
+deg_suffix = deg{formcompiler.ufl_variants.degree}
+diff_suffix = symdiff
+quadvec_suffix = quadvec
+gradvec_suffix = autotunevec
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = autotune
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 1
diff --git a/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..47b0e5269c9ebf62d03328e0f5370528a14ac9f6
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_dg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 0
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0b6d75f5c1181bcfe9f5c7ab9c6a941cd9653251
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
@@ -0,0 +1,29 @@
+__name = sumfact_poisson_dg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+# vectorization_strategy = explicit, none | expand gradvec
+vectorization_strategy = model, none | expand gradvec
+fastdg = 0
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
diff --git a/test/sumfact/poisson/poisson_dg_volumes_3d.ufl b/test/sumfact/poisson/poisson_dg_volumes_3d.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..4e491864547989ce900fa0c4bac527fe673e14b1
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_volumes_3d.ufl
@@ -0,0 +1,38 @@
+cell = hexahedron
+dim = 3
+
+x = SpatialCoordinate(cell)
+c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2
+g = exp(-1.*c)
+f = 2*(3.-2*c)*g
+
+V = FiniteElement("DG", cell, degree)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = 1.0
+
+r = inner(grad(u), grad(v))*dx \
+  - f*v*dx
+
+  # - inner(n, avg(grad(u)))*jump(v)*dS \
+  # + gamma_int*jump(u)*jump(v)*dS \
+  # + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  # - inner(n, grad(u))*v*ds \
+  # + gamma_ext*u*v*ds \
+  # + theta*u*inner(grad(v), n)*ds \
+  # - gamma_ext*g*v*ds \
+  # - theta*g*inner(grad(v), n)*ds
+
+exact_solution = g
diff --git a/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..22461b9cc8bbafc6a9e15c2489bd35bf432bb5de
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = model, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0e9cfea80bab55167a277ce8e189761e9b780420
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2
diff --git a/test/sumfact/poisson/poisson_fastdg_volumes_3d_benchmark.mini b/test/sumfact/poisson/poisson_fastdg_volumes_3d_benchmark.mini
new file mode 100644
index 0000000000000000000000000000000000000000..253c36bbb6910588b3cabedfa73f3c950ed46e1f
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_volumes_3d_benchmark.mini
@@ -0,0 +1,32 @@
+__name = sumfact_poisson_fastdg_volumes_3d_benchmark_{__exec_suffix}
+__exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
+
+deg_suffix = deg{formcompiler.ufl_variants.degree}
+diff_suffix = symdiff
+quadvec_suffix = quadvec
+gradvec_suffix = autotunevec
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu
+
+[formcompiler]
+# Since this test makes a DG scheme without skeletons the solution is garbage.
+# This test just tests generation of microbenchmarks.
+# compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = autotune
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 1
diff --git a/test/sumfact/preconditioner/CMakeLists.txt b/test/sumfact/preconditioner/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ea77ba5dca71ce5afdff567be1f872dc0dc604e0
--- /dev/null
+++ b/test/sumfact/preconditioner/CMakeLists.txt
@@ -0,0 +1,11 @@
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_preconditioner_2d
+                                  INIFILE preconditioner_2d.mini
+                                  SOURCE test_preconditioner_2d.cc
+                                  )
+
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_preconditioner_3d
+                                  INIFILE preconditioner_3d.mini
+                                  SOURCE test_preconditioner_3d.cc
+                                  )
diff --git a/test/sumfact/preconditioner/poisson_dg_2d.ufl b/test/sumfact/preconditioner/poisson_dg_2d.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..ff41f6164df46aa1909db03768aef4041011afae
--- /dev/null
+++ b/test/sumfact/preconditioner/poisson_dg_2d.ufl
@@ -0,0 +1,38 @@
+cell = "quadrilateral"
+dim = 2
+degree = 1
+
+x = SpatialCoordinate(cell)
+c = (0.5-x[0])**2 + (0.5-x[1])**2
+g = exp(-1.*c)
+f = 2*(2.-2*c)*g
+
+V = FiniteElement("DG", cell, degree)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = 1.0
+
+r = inner(grad(u), grad(v))*dx \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
+
+exact_solution = g
diff --git a/test/sumfact/preconditioner/poisson_dg_3d.ufl b/test/sumfact/preconditioner/poisson_dg_3d.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..cc6594573762ee12e1b11206eeba07a4bd038726
--- /dev/null
+++ b/test/sumfact/preconditioner/poisson_dg_3d.ufl
@@ -0,0 +1,38 @@
+cell = hexahedron
+dim = 3
+degree = 1
+
+x = SpatialCoordinate(cell)
+c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2
+g = exp(-1.*c)
+f = 2*(3.-2*c)*g
+
+V = FiniteElement("DG", cell, degree)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = 1.0
+
+r = inner(grad(u), grad(v))*dx \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
+
+exact_solution = g
diff --git a/test/sumfact/preconditioner/preconditioner_2d.mini b/test/sumfact/preconditioner/preconditioner_2d.mini
new file mode 100644
index 0000000000000000000000000000000000000000..adcf214dde7018a9c5aa093cff89c51cf67d6a3a
--- /dev/null
+++ b/test/sumfact/preconditioner/preconditioner_2d.mini
@@ -0,0 +1,46 @@
+__name = preconditioner_2d
+__exec_suffix = exec
+
+cells = 2 2
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+operators = r, blockdiag, blockoffdiag, pointdiag
+
+[formcompiler.r]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+classname = FullOperator
+filename = full_2d_operator.hh
+
+[formcompiler.blockdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_diagonal = 1
+form = r
+classname = BlockDiagonalOperator
+filename = block_diagonal_2d_operator.hh
+
+[formcompiler.blockoffdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_offdiagonal = 1
+form = r
+classname = BlockOffDiagonalOperator
+filename = block_offdiagonal_2d_operator.hh
+
+[formcompiler.pointdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_pointdiagonal = 1
+form = r
+classname = PointDiagonalOperator
+filename = point_diagonal_2d_operator.hh
diff --git a/test/sumfact/preconditioner/preconditioner_3d.mini b/test/sumfact/preconditioner/preconditioner_3d.mini
new file mode 100644
index 0000000000000000000000000000000000000000..304d6e28f0e6f152d6327929984c0121dd55e978
--- /dev/null
+++ b/test/sumfact/preconditioner/preconditioner_3d.mini
@@ -0,0 +1,46 @@
+__name = preconditioner_3d
+__exec_suffix = exec
+
+cells = 2 2 2
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+operators = r, blockdiag, blockoffdiag, pointdiag
+
+[formcompiler.r]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+classname = FullOperator
+filename = full_3d_operator.hh
+
+[formcompiler.blockdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_diagonal = 1
+form = r
+classname = BlockDiagonalOperator
+filename = block_diagonal_3d_operator.hh
+
+[formcompiler.blockoffdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_offdiagonal = 1
+form = r
+classname = BlockOffDiagonalOperator
+filename = block_offdiagonal_3d_operator.hh
+
+[formcompiler.pointdiag]
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_pointdiagonal = 1
+form = r
+classname = PointDiagonalOperator
+filename = point_diagonal_3d_operator.hh
diff --git a/test/sumfact/preconditioner/test_preconditioner_2d.cc b/test/sumfact/preconditioner/test_preconditioner_2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2781b32399f800d1d6334565aefbede360fdbfa9
--- /dev/null
+++ b/test/sumfact/preconditioner/test_preconditioner_2d.cc
@@ -0,0 +1,129 @@
+#include "config.h"
+
+#include "dune/common/parallel/mpihelper.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/grid/yaspgrid.hh"
+#include "dune/pdelab/finiteelementmap/qkdg.hh"
+#include "dune/pdelab/gridoperator/fastdg.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/common/parametertreeparser.hh"
+#include <random>
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "string"
+#include "dune/codegen/vtkpredicate.hh"
+
+// Include all the generated operators
+#include "full_2d_operator.hh"
+#include "block_diagonal_2d_operator.hh"
+#include "block_offdiagonal_2d_operator.hh"
+#include "point_diagonal_2d_operator.hh"
+
+int main(int argc, char** argv){  
+  try
+  {    
+    // Initialize basic stuff...    
+    Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv);
+    using RangeType = double;
+    Dune::ParameterTree initree;
+    Dune::ParameterTreeParser::readINITree(argv[1], initree);
+    
+    // Setup grid (view)...    
+    using Grid = Dune::YaspGrid<2, Dune::EquidistantCoordinates<RangeType, 2>>;
+    using GV = Grid::LeafGridView;
+    using DF = Grid::ctype;
+    IniGridFactory<Grid> factory(initree);
+    std::shared_ptr<Grid> grid = factory.getGrid();
+    GV gv = grid->leafGridView();
+    
+    // Set up finite element maps...    
+    using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 2>;
+    DG2_FEM dg2_fem;
+    
+    // Set up grid function spaces...    
+    using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>;
+    using NoConstraintsAssembler = Dune::PDELab::NoConstraints;
+    using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>;
+    DG2_GFS dg2_gfs_(gv, dg2_fem);
+    dg2_gfs_.name("dg2_gfs_");
+    
+    // Set up constraints container...    
+    using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type;
+    DG2_GFS_CC dg2_gfs__cc;
+    dg2_gfs__cc.clear();
+    Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc);
+    
+    // Set up grid operators...
+    using FullLOP = FullOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+    using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree);
+    dg2_gfs_.update();
+    MatrixBackend mb(5);
+    FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb);
+
+    // Additional grid operators for preconditioner
+    using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree);
+    BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb);
+
+    using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree);
+    BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb);
+
+    using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree);
+    PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb);
+
+    // Set up solution vectors...    
+    using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>;
+    V_R x(dg2_gfs_, 0.0);
+    
+    // Testing!
+    
+    // Assemble all those matrices
+    using Dune::PDELab::Backend::native;
+    using M = typename FullGO::Traits::Jacobian;
+    M m(fullgo);
+    fullgo.jacobian(x, m);
+    Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1);
+
+    using BDM = typename BDGO::Traits::Jacobian;
+    BDM bdm(bdgo);
+    bdgo.jacobian(x, bdm);
+    Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1);
+
+    using BODM = typename BODGO::Traits::Jacobian;
+    BODM bodm(bodgo);
+    bodgo.jacobian(x, bodm);
+    Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1);
+
+    V_R pd(dg2_gfs_, 0.0);
+    pdgo.residual(x, pd);
+    Dune::printvector(std::cout, native(pd), "point diagonal vector", "row");
+
+    // test failure boolean
+    bool testfail(false);
+
+    // TODO: Properly test this stuff given the above matrices.
+    //       Right now, visuals need to suffice.
+
+    // Return statement...    
+    return testfail;
+    
+  }  
+  catch (Dune::Exception& e)
+  {    std::cerr << "Dune reported error: " << e << std::endl;
+    return 1;
+  }  
+  catch (std::exception& e)
+  {    std::cerr << "Unknown exception thrown!" << std::endl;
+    return 1;
+  }  
+}
+
diff --git a/test/sumfact/preconditioner/test_preconditioner_3d.cc b/test/sumfact/preconditioner/test_preconditioner_3d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec84a1efbf7e19f46d8531622374f2902a3139eb
--- /dev/null
+++ b/test/sumfact/preconditioner/test_preconditioner_3d.cc
@@ -0,0 +1,129 @@
+#include "config.h"
+
+#include "dune/common/parallel/mpihelper.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/grid/yaspgrid.hh"
+#include "dune/pdelab/finiteelementmap/qkdg.hh"
+#include "dune/pdelab/gridoperator/fastdg.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/common/parametertreeparser.hh"
+#include <random>
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "string"
+#include "dune/codegen/vtkpredicate.hh"
+
+// Include all the generated operators
+#include "full_3d_operator.hh"
+#include "block_diagonal_3d_operator.hh"
+#include "block_offdiagonal_3d_operator.hh"
+#include "point_diagonal_3d_operator.hh"
+
+int main(int argc, char** argv){  
+  try
+  {    
+    // Initialize basic stuff...    
+    Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv);
+    using RangeType = double;
+    Dune::ParameterTree initree;
+    Dune::ParameterTreeParser::readINITree(argv[1], initree);
+    
+    // Setup grid (view)...    
+    using Grid = Dune::YaspGrid<3, Dune::EquidistantCoordinates<RangeType, 3>>;
+    using GV = Grid::LeafGridView;
+    using DF = Grid::ctype;
+    IniGridFactory<Grid> factory(initree);
+    std::shared_ptr<Grid> grid = factory.getGrid();
+    GV gv = grid->leafGridView();
+    
+    // Set up finite element maps...    
+    using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 3>;
+    DG2_FEM dg2_fem;
+    
+    // Set up grid function spaces...    
+    using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>;
+    using NoConstraintsAssembler = Dune::PDELab::NoConstraints;
+    using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>;
+    DG2_GFS dg2_gfs_(gv, dg2_fem);
+    dg2_gfs_.name("dg2_gfs_");
+    
+    // Set up constraints container...    
+    using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type;
+    DG2_GFS_CC dg2_gfs__cc;
+    dg2_gfs__cc.clear();
+    Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc);
+    
+    // Set up grid operators...
+    using FullLOP = FullOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+    using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree);
+    dg2_gfs_.update();
+    MatrixBackend mb(5);
+    FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb);
+
+    // Additional grid operators for preconditioner
+    using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree);
+    BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb);
+
+    using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree);
+    BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb);
+
+    using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree);
+    PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb);
+
+    // Set up solution vectors...    
+    using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>;
+    V_R x(dg2_gfs_, 0.0);
+    
+    // Testing!
+    
+    // Assemble all those matrices
+    using Dune::PDELab::Backend::native;
+    using M = typename FullGO::Traits::Jacobian;
+    M m(fullgo);
+    fullgo.jacobian(x, m);
+    Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1);
+
+    using BDM = typename BDGO::Traits::Jacobian;
+    BDM bdm(bdgo);
+    bdgo.jacobian(x, bdm);
+    Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1);
+
+    using BODM = typename BODGO::Traits::Jacobian;
+    BODM bodm(bodgo);
+    bodgo.jacobian(x, bodm);
+    Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1);
+
+    V_R pd(dg2_gfs_, 0.0);
+    pdgo.residual(x, pd);
+    Dune::printvector(std::cout, native(pd), "point diagonal vector", "row");
+
+    // test failure boolean
+    bool testfail(false);
+
+    // TODO: Properly test this stuff given the above matrices.
+    //       Right now, visuals need to suffice.
+
+    // Return statement...    
+    return testfail;
+    
+  }  
+  catch (Dune::Exception& e)
+  {    std::cerr << "Dune reported error: " << e << std::endl;
+    return 1;
+  }  
+  catch (std::exception& e)
+  {    std::cerr << "Unknown exception thrown!" << std::endl;
+    return 1;
+  }  
+}
+