diff --git a/CMakeLists.txt b/CMakeLists.txt
index a496109044e05590d8458a73f2c2a5e7c2d78074..20e7d8a8b1ee12d80245e296a726d6eceffa5040 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.1.3)
 project(dune-codegen CXX)
 if(NOT (dune-common_DIR OR dune-common_ROOT OR
 # start a dune project with information from dune.module
 dune_add_library(dunecodegen dune/codegen/common/tsc.cc)
diff --git a/README.md b/README.md
index 8896b12177ec1532e69d64f1748b81f140123643..5856f3514a91a64db9734af6cf7825952b3935c0 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,25 @@ ctest
 Note that this takes quite a while.
+## Building and Running dune-codegen in an offline environment
+dune-codegen relies on installing Python packages into self-contained environments
+during its configuration and build process. In order to do this in an offline
+environment, we recommend using the tool `devpi`. One of its use cases is to provide
+a local mirror for the Python package index. A quickstart tutorial for this use case
+is available [5]. It boils down to the following:
+* Installing the `devpi-server` package through your favorite method
+* Setting up a local server with `devpi-server --init`
+* Making sure it is running in the background (explicitly with `devpi-server --start/stop` or by configuring a systemd service).
+* Setting the environment variable `PIP_INDEX_URL` to point to its index, e.g. by adding this line to your `~/.bashrc` (where `http://localhost:3141` might differ depending on your devpi configuration):
+export PIP_INDEX_URL=http://localhost:3141/root/pypi/+simple/
+The first time a package is installed, the local mirror fetches it from PyPI, so this step requires network access.
+Subsequent installations of that package are served from the mirror's local cache.
 ## Links
 [0]: https://git-lfs.github.com/
@@ -112,3 +131,4 @@ Note that this takes quite a while.
 [2]: https://gitlab.dune-project.org/quality/dune-testtools
 [3]: http://isl.gforge.inria.fr/
 [4]: https://www.dune-project.org/doc/installation/
+[5]: https://github.com/devpi/devpi/blob/master/doc/quickstart-pypimirror.rst
diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake
index da3225866785c75a8cf73e6aa78b6e3e0eea42f9..cff09c5ec49c0675680b5f49c1f24f9008c93cc5 100644
--- a/cmake/modules/DuneCodegenMacros.cmake
+++ b/cmake/modules/DuneCodegenMacros.cmake
@@ -116,6 +116,11 @@ function(dune_add_generated_executable)
     message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. This usually indicates a typo.")
+  set(MPI_OPTION "0")
+  if(MPI_FOUND)
+    set(MPI_OPTION "1")
+  endif()
   # Apply defaults and enforce requirements
     message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable")
@@ -139,6 +144,7 @@ function(dune_add_generated_executable)
                                --target-name ${GEN_TARGET}
                                --driver-file ${GEN_SOURCE}
                                --project-basedir ${CMAKE_BINARY_DIR}
+                               --with-mpi ${MPI_OPTION}
                        COMMENT "Generating driver for the target ${GEN_TARGET}"
@@ -172,10 +178,8 @@ function(dune_add_generated_executable)
   # Parse a mapping of operators to build and their respective filenames
-  dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET}
-                       OUTPUT_VARIABLE depdata
-                       )
-  parse_python_data(PREFIX depdata INPUT ${depdata})
+  dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
+  parse_python_data(PREFIX depdata FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
     # This is a bit silly, but cProfile only finds entry point scripts
@@ -199,6 +203,7 @@ function(dune_add_generated_executable)
                                --ini-file ${GEN_INIFILE}
                                --target-name ${GEN_TARGET}
                                --operator-to-build ${op}
+                               --with-mpi ${MPI_OPTION}
                        COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}"
diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake
index 87af7161c8b9c8196dbddc3ae42e55bcffd3bd81..2a1907950a7f5bd68e85a5c536465a73634828cf 100644
--- a/cmake/modules/GeneratedSystemtests.cmake
+++ b/cmake/modules/GeneratedSystemtests.cmake
@@ -119,9 +119,14 @@ function(dune_add_formcompiler_system_test)
   # expand the given meta ini file into the build tree
-  execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py --cmake --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} --dir ${CMAKE_CURRENT_BINARY_DIR} --section formcompiler
-                  OUTPUT_VARIABLE output)
-  parse_python_data(PREFIX INIINFO INPUT "${output}")
+  execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py
+                          --cmake
+                          --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE}
+                          --dir ${CMAKE_CURRENT_BINARY_DIR}
+                          --section formcompiler
+                          --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
+                          )
+  parse_python_data(PREFIX INIINFO FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
   foreach(inifile ${INIINFO_names})
     if(${INIINFO_${inifile}_suffix} STREQUAL "__empty")
@@ -147,10 +152,11 @@ function(dune_add_formcompiler_system_test)
     # just the way that dune-testtools does.
     dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_extract_static.py
                                --ini ${inifile}
+                               --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
                          OUTPUT_VARIABLE output
                          ERROR_MESSAGE "Error extracting static info from ${inifile}")
-    parse_python_data(PREFIX STAT INPUT "${output}")
+    parse_python_data(PREFIX STAT FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
     foreach(config ${STAT___CONFIGS})
       foreach(cd ${STAT___STATIC_DATA})
diff --git a/cmake/modules/deplist.py b/cmake/modules/deplist.py
index 9cb5d7d42cbfc712e18e37798dffc2d553416f8c..7b0afac3c7c5f216e9710cef394d175c0c6c9ee9 100755
--- a/cmake/modules/deplist.py
+++ b/cmake/modules/deplist.py
@@ -22,5 +22,5 @@ def get_filename(operator):
 result = {"__{}".format(o): get_filename(o) for o in operators}
 result["__operators"] = ";".join(operators)
+printForCMake(result, sys.argv[3])
diff --git a/dune/codegen/sumfact/horizontaladd.hh b/dune/codegen/sumfact/horizontaladd.hh
index fc62dc47dab77330a125bb5f91805c89808480b0..7dd122af4c877b7206e99d71ba8aa05fc2fc448d 100644
--- a/dune/codegen/sumfact/horizontaladd.hh
+++ b/dune/codegen/sumfact/horizontaladd.hh
@@ -4,6 +4,50 @@
+// Only use our custom implementations if we have AVX2 or later!
+#if INSTRSET >= 8
+/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd
+ *  instruction and instead uses the shuffle port.
+ */
+static inline double permuting_horizontal_add (const Vec2d & a)
+    return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a));
+/** Implement a variant of horizontal_add(Vec4d) that avoids the haddpd
+ *  instruction and instead uses the shuffle port.
+ */
+static inline double permuting_horizontal_add (const Vec4d& a)
+    __m128d valupper = _mm256_extractf128_pd(a, 1);
+    __m128d vallower = _mm256_castpd256_pd128(a);
+    __m128d valval = _mm_add_pd(valupper, vallower);
+    __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
+    return _mm_cvtsd_f64(res);
+#if MAX_VECTOR_SIZE >= 512
+/** Implement a variant of horizontal_add(Vec8d) that avoids the haddpd
+ *  instruction and instead uses the shuffle port.
+ */
+static inline double permuting_horizontal_add(const Vec8d& a)
+  return permuting_horizontal_add(a.get_low() + a.get_high());
+template<typename V>
+static inline double permuting_horizontal_add (const V& a)
+    return horizontal_add(a);
 template<class V>
 typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
@@ -16,4 +60,16 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
   return horizontal_add(x.get_high());
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
+  return permuting_horizontal_add(x.get_low());
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
+  return permuting_horizontal_add(x.get_high());
diff --git a/dune/codegen/sumfact/oc_horizontaladd.hh b/dune/codegen/sumfact/oc_horizontaladd.hh
new file mode 100644
index 0000000000000000000000000000000000000000..d136aaf5ffc226e8e2463c5c40b3d64c8a476fe7
--- /dev/null
+++ b/dune/codegen/sumfact/oc_horizontaladd.hh
@@ -0,0 +1,25 @@
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
+  return horizontal_add(x.get_low());
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
+  return horizontal_add(x.get_high());
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x)
+  return horizontal_add(x);
diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh
index 7d1d45112d392a13a3dec4881ba43222cf60054b..2d0cdc6f543f2cb43d3a28bf55f562db0e64f13a 100755
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch
 git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
-pushd dune/codegen/vectorclass
-git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
 pushd python/ufl
 git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch
diff --git a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch b/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
deleted file mode 100644
index c5ca6dc30e2135ab30a28c7373b94da344b8a7ac..0000000000000000000000000000000000000000
--- a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Thu, 30 Nov 2017 18:51:49 +0100
-Subject: [PATCH] Alternative implementation of horizontal_add on AVX512
- vectorf512.h | 19 +++++++++++++------
- 1 file changed, 13 insertions(+), 6 deletions(-)
-diff --git a/vectorf512.h b/vectorf512.h
-index 0845d12..6a15ac2 100644
---- a/vectorf512.h
-+++ b/vectorf512.h
-@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b)
- // General arithmetic functions, etc.
-+#if __GNUC__ < 7
-+extern __inline double
-+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-+_mm512_cvtsd_f64 (__m512d __A)
-+  return __A[0];
- // Horizontal add: Calculates the sum of all vector elements.
--static inline double horizontal_add (Vec8d const & a) {
--#if defined(__INTEL_COMPILER)
--    return _mm512_reduce_add_pd(a);
--    return horizontal_add(a.get_low() + a.get_high());
-+static inline double horizontal_add (Vec8d const & x) {
-+    __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4)));
-+    return _mm512_cvtsd_f64(intermediate);
- }
- // function max: a > b ? a : b
diff --git a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
deleted file mode 100644
index fee83d7ad7cedbacc588c530ad9581b49cfa3b54..0000000000000000000000000000000000000000
--- a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Mon, 19 Jun 2017 13:07:22 +0200
-Subject: [PATCH] Better implementation of horizontal_add
- vectorf256.h | 9 +++++----
- 1 file changed, 5 insertions(+), 4 deletions(-)
-diff --git a/vectorf256.h b/vectorf256.h
-index db509f8..2bbd9de 100644
---- a/vectorf256.h
-+++ b/vectorf256.h
-@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b)
- // Horizontal add: Calculates the sum of all vector elements.
- static inline double horizontal_add (Vec4d const & a) {
--    __m256d t1 = _mm256_hadd_pd(a,a);
--    __m128d t2 = _mm256_extractf128_pd(t1,1);
--    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
--    return _mm_cvtsd_f64(t3);        
-+    const __m128d valupper = _mm256_extractf128_pd(a, 1);
-+    const __m128d vallower = _mm256_castpd256_pd128(a);
-+    const __m128d valval = _mm_add_pd(valupper, vallower);
-+    const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
-+    return _mm_cvtsd_f64(res);
- }
- // function max: a > b ? a : b
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 8881504f934e7b1d755580ffc46d47f2a1467346..67694937376b1b6cdaa66d44db07289652f62bcc 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory(test)
 add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
 target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
-  target_link_libraries(_autotune_target benchmark)
+if(benchmark_FOUND AND Threads_FOUND)
+  target_link_libraries(_autotune_target benchmark Threads::Threads)
diff --git a/python/dune/codegen/blockstructured/accumulation.py b/python/dune/codegen/blockstructured/accumulation.py
index 9af9759d5e3426df12a10db39f94048e48006867..ec2e3d9917998a7a024796d48aaf6be6488a271c 100644
--- a/python/dune/codegen/blockstructured/accumulation.py
+++ b/python/dune/codegen/blockstructured/accumulation.py
@@ -1,12 +1,12 @@
-from dune.codegen.blockstructured.tools import sub_element_inames
-from dune.codegen.generation import accumulation_mixin, instruction
+from dune.codegen.blockstructured.tools import sub_element_inames, name_accumulation_alias
+from dune.codegen.generation import accumulation_mixin, instruction, get_global_context_value
 from dune.codegen.loopy.target import dtype_floatingpoint
 from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.geometry import world_dimension, name_intersection_geometry_wrapper
 from dune.codegen.pdelab.localoperator import determine_accumulation_space, GenericAccumulationMixin
 from dune.codegen.pdelab.argument import name_accumulation_variable
 from dune.codegen.pdelab.localoperator import boundary_predicates
-from dune.codegen.generation.loopy import function_mangler, globalarg, temporary_variable
+from dune.codegen.generation.loopy import function_mangler, temporary_variable
 import loopy as lp
 import pymbolic.primitives as prim
@@ -16,32 +16,10 @@ from loopy.match import Writes
 class BlockStructuredAccumulationMixin(GenericAccumulationMixin):
     def generate_accumulation_instruction(self, expr):
-        if get_form_option('vectorization_blockstructured'):
-            return generate_accumulation_instruction_vectorized(expr, self)
-        else:
+        if get_global_context_value("form_type") == "jacobian":
             return generate_accumulation_instruction(expr, self)
-def name_accumulation_alias(container, accumspace):
-    name = container + "_" + accumspace.lfs.name + "_alias"
-    name_tail = container + "_" + accumspace.lfs.name + "_alias_tail"
-    k = get_form_option("number_of_blocks")
-    p = accumspace.element.degree()
-    def _add_alias_insn(name):
-        dim = world_dimension()
-        element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
-        index_stride = tuple((p * k + 1)**i for i in range(0, dim))
-        globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
-        code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name)
-        instruction(within_inames=frozenset(),
-                    code=code,
-                    read_variables=frozenset({container}),
-                    assignees=frozenset({name}))
-    _add_alias_insn(name)
-    _add_alias_insn(name_tail)
-    return name
+        else:
+            return generate_accumulation_instruction_vectorized(expr, self)
diff --git a/python/dune/codegen/blockstructured/argument.py b/python/dune/codegen/blockstructured/argument.py
index 420773e85bea93ee55cb310255da7fa60d55d9de..deff1b8415d246d0ed197692df46b61db2dbd5cc 100644
--- a/python/dune/codegen/blockstructured/argument.py
+++ b/python/dune/codegen/blockstructured/argument.py
@@ -1,29 +1,11 @@
-from dune.codegen.generation import (kernel_cached,
-                                     valuearg, instruction, globalarg)
+from dune.codegen.generation import kernel_cached, valuearg
 from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.argument import CoefficientAccess
-from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames
-from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames, name_container_alias
 from loopy.types import NumpyType
 import pymbolic.primitives as prim
-def name_alias(container, lfs, element):
-    name = container + "_" + lfs.name + "_alias"
-    k = get_form_option("number_of_blocks")
-    p = element.degree()
-    dim = world_dimension()
-    element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
-    index_stride = tuple((p * k + 1)**i for i in range(0, dim))
-    globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
-    code = "const auto {} = &{}({},0);".format(name, container, lfs.name)
-    instruction(within_inames=frozenset(),
-                code=code,
-                read_variables=frozenset({container}),
-                assignees=frozenset({name}))
-    return name
 # TODO remove the need for element
 def pymbolic_coefficient(container, lfs, element, index):
@@ -36,9 +18,6 @@ def pymbolic_coefficient(container, lfs, element, index):
         lfs = prim.Variable(lfs)
     # use higher order FEM index instead of Q1 index
-    if get_form_option("vectorization_blockstructured"):
-        subelem_inames = sub_element_inames()
-        coeff_alias = name_alias(container, lfs, element)
-        return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index))
-    else:
-        return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
+    subelem_inames = sub_element_inames()
+    coeff_alias = name_container_alias(container, lfs, element)
+    return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index))
diff --git a/python/dune/codegen/blockstructured/basis.py b/python/dune/codegen/blockstructured/basis.py
index 5af8bb48f2bb625a0b336c5cae6d42863f0063c5..3781ac2a35855b03ed76c6fdb1c1b3db8c73559e 100644
--- a/python/dune/codegen/blockstructured/basis.py
+++ b/python/dune/codegen/blockstructured/basis.py
@@ -7,7 +7,7 @@ from dune.codegen.generation import (basis_mixin,
-                                     include_file,)
+                                     include_file, preamble)
 from dune.codegen.tools import get_pymbolic_basename, get_pymbolic_indices
 from dune.codegen.loopy.target import type_floatingpoint
 from dune.codegen.pdelab.basis import (GenericBasisMixin,
@@ -22,7 +22,7 @@ from dune.codegen.pdelab.geometry import world_dimension, component_iname
 from dune.codegen.pdelab.spaces import type_leaf_gfs, name_lfs
 from dune.codegen.pdelab.restriction import restricted_name
 from dune.codegen.blockstructured.spaces import lfs_inames
-from dune.codegen.blockstructured.tools import tensor_index_to_sequential_index, sub_element_inames
+from dune.codegen.blockstructured.tools import tensor_index_to_sequential_index, remove_sub_element_inames
 from ufl import MixedElement
@@ -38,11 +38,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         assert not isinstance(element, MixedElement)
         name = "phi_{}".format(FEM_name_mangling(element))
         name = restricted_name(name, restriction)
+        self.init_basis_cache(element, restriction)
         self.evaluate_basis(element, name, restriction)
         inames = self.lfs_inames(element, restriction, number, context=context)
         return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
+    @preamble(kernel='operator')
+    def init_basis_cache(self, element, restriction):
+        if not restriction:
+            cache = name_localbasis_cache(element)
+            localbasis = name_localbasis(element)
+            qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
+            return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
+                    "{",
+                    "  {}.evaluateFunction({}[i], {});".format(cache, qp_name, localbasis),
+                    "}"]
+        else:
+            return []
     def evaluate_basis(self, element, name, restriction):
         temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1),
@@ -50,7 +64,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         cache = name_localbasis_cache(element)
         qp = self.to_cell(self.quadrature_position_in_micro())
         localbasis = name_localbasis(element)
-        instruction(inames=self.quadrature_inames(),
+        instruction(inames=remove_sub_element_inames(self.quadrature_inames()),
                     code='{} = {}.evaluateFunction({}, {});'.format(name, cache, str(qp), localbasis),
@@ -60,11 +74,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         assert not isinstance(element, MixedElement)
         name = "js_{}".format(FEM_name_mangling(element))
         name = restricted_name(name, restriction)
+        self.init_gradient_cache(element, restriction)
         self.evaluate_reference_gradient(element, name, restriction)
         inames = self.lfs_inames(element, restriction, number, context=context)
         return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
+    @preamble(kernel='operator')
+    def init_gradient_cache(self, element, restriction):
+        if not restriction:
+            cache = name_localbasis_cache(element)
+            localbasis = name_localbasis(element)
+            qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
+            return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
+                    "{",
+                    "  {}.evaluateJacobian({}[i], {});".format(cache, qp_name, localbasis),
+                    "}"]
+        else:
+            return []
     def evaluate_reference_gradient(self, element, name, restriction):
         temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1, world_dimension()),
@@ -72,7 +100,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         cache = name_localbasis_cache(element)
         qp = self.to_cell(self.quadrature_position_in_micro())
         localbasis = name_localbasis(element)
-        instruction(inames=self.quadrature_inames(),
+        instruction(inames=remove_sub_element_inames(self.quadrature_inames()),
                     code='{} = {}.evaluateJacobian({}, {});'.format(name, cache, str(qp), localbasis),
@@ -104,7 +132,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True),
-                    forced_iname_deps=frozenset(self.quadrature_inames() + (dimindex,) + sub_element_inames()),
+                    forced_iname_deps=frozenset(self.quadrature_inames() + (dimindex,)),
@@ -131,7 +159,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
         instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True),
-                    forced_iname_deps=frozenset(self.quadrature_inames() + sub_element_inames()),
+                    forced_iname_deps=frozenset(self.quadrature_inames()),
diff --git a/python/dune/codegen/blockstructured/geometry.py b/python/dune/codegen/blockstructured/geometry.py
index 3ad3a9026d8a5287e54e3950aef0aa199d08cc4e..975345eb572cd3adad443c879cff29220c6d7f65 100644
--- a/python/dune/codegen/blockstructured/geometry.py
+++ b/python/dune/codegen/blockstructured/geometry.py
@@ -1,9 +1,13 @@
+import pymbolic.primitives as prim
+from loopy.match import Writes
+from dune.codegen.blockstructured.tools import name_point_in_macro
 from dune.codegen.generation import (geometry_mixin,
-from dune.codegen.tools import get_pymbolic_basename
+from dune.codegen.loopy.symbolic import FusedMultiplyAdd as FMA
 from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.geometry import (AxiparallelGeometryMixin,
@@ -15,12 +19,9 @@ from dune.codegen.pdelab.geometry import (AxiparallelGeometryMixin,
-from dune.codegen.blockstructured.tools import (sub_element_inames,
-                                                name_point_in_macro,
-                                                )
+from dune.codegen.pdelab.tensors import name_matrix_inverse, name_determinant
+from dune.codegen.tools import get_pymbolic_basename
 from dune.codegen.ufl.modified_terminals import Restriction
-import pymbolic.primitives as prim
-from loopy.match import Writes
@@ -49,43 +50,25 @@ class BlockStructuredGeometryMixin(GenericPDELabGeometryMixin):
                              prim.Power(get_form_option("number_of_blocks"), local_dimension()))
     def jacobian_determinant(self, o):
-        name = 'detjac'
-        self.define_jacobian_determinant(name)
-        return prim.Quotient(prim.Variable(name),
-                             prim.Power(get_form_option("number_of_blocks"), local_dimension()))
-    def define_jacobian_determinant(self, name):
-        temporary_variable(name, shape=(), managed=True)
-        determinant_signed = name_jacobian_determinant_signed(self)
+        jacobian = name_jacobian_matrix(self)
+        name = name_determinant(jacobian, (world_dimension(), world_dimension()), self)
-        return instruction(expression=prim.Call(prim.Variable("abs"), (prim.Variable(determinant_signed),)),
-                           assignee=prim.Variable(name),
-                           within_inames=frozenset(sub_element_inames() + self.quadrature_inames()),
-                           depends_on=frozenset({Writes(determinant_signed)})
-                           )
+        return prim.Quotient(prim.Call(prim.Variable("abs"), (prim.Variable(name),)),
+                             prim.Power(get_form_option("number_of_blocks"), local_dimension()))
     def jacobian_inverse(self, o):
-        restriction = enforce_boundary_restriction(self)
         assert(len(self.indices) == 2)
         i, j = self.indices
         self.indices = None
-        name = restricted_name("jit", restriction)
-        self.define_jacobian_inverse_transposed(name, restriction)
+        restriction = enforce_boundary_restriction(self)
+        jacobian = restricted_name(name_jacobian_matrix(self), restriction)
+        name = name_matrix_inverse(jacobian, (world_dimension(), world_dimension()), self)
         return prim.Product((prim.Subscript(prim.Variable(name), (j, i)),
-    def define_jacobian_inverse_transposed(self, name, restriction):
-        temporary_variable(name, shape=(world_dimension(), world_dimension()), managed=True)
-        jacobian = name_jacobian_matrix(self)
-        det_inv = name_jacobian_determinant_inverse(self)
-        compute_inverse_transposed(name, det_inv, jacobian, self)
 class AxiparallelBlockStructuredGeometryMixin(AxiparallelGeometryMixin, BlockStructuredGeometryMixin):
@@ -167,12 +150,12 @@ def compute_jacobian(name, visitor):
         a, b, c = coefficients
         expr_jac = [None, None]
-        expr_jac[0] = prim.Sum((prim.Product((prim.Subscript(pymbolic_pos, (1,)),
-                                              prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)))),
-                                prim.Subscript(prim.Variable(b), (prim.Variable(jac_iname),))))
-        expr_jac[1] = prim.Sum((prim.Product((prim.Subscript(pymbolic_pos, (0,)),
-                                              prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)))),
-                                prim.Subscript(prim.Variable(c), (prim.Variable(jac_iname),))))
+        expr_jac[0] = FMA(prim.Subscript(pymbolic_pos, (1,)),
+                          prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)),
+                          prim.Subscript(prim.Variable(b), (prim.Variable(jac_iname),)))
+        expr_jac[1] = FMA(prim.Subscript(pymbolic_pos, (0,)),
+                          prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)),
+                          prim.Subscript(prim.Variable(c), (prim.Variable(jac_iname),)))
     elif world_dimension() == 3:
         a, b, c, d, e, f, g = coefficients
@@ -183,21 +166,20 @@ def compute_jacobian(name, visitor):
         # with k, l in {0,1,2} != i and k<l and vj = terms[i][j]
         for i in range(3):
             k, l = sorted(set(range(3)) - {i})
-            expr_jac[i] = prim.Sum((prim.Product((prim.Subscript(pymbolic_pos, (k,)), prim.Subscript(pymbolic_pos, (l,)),
-                                                  prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)))),
-                                    prim.Product((prim.Subscript(pymbolic_pos, (k,)),
-                                                  prim.Subscript(prim.Variable(terms[i][0]), (prim.Variable(jac_iname),)))),
-                                    prim.Product((prim.Subscript(pymbolic_pos, (l,)),
-                                                  prim.Subscript(prim.Variable(terms[i][1]), (prim.Variable(jac_iname),)))),
-                                    prim.Subscript(prim.Variable(terms[i][2]), (prim.Variable(jac_iname),))
-                                    ))
+            expr_jac[i] = FMA(prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)),
+                              prim.Subscript(pymbolic_pos, (k,)) * prim.Subscript(pymbolic_pos, (l,)),
+                              FMA(prim.Subscript(prim.Variable(terms[i][0]), (prim.Variable(jac_iname),)),
+                                  prim.Subscript(pymbolic_pos, (k,)),
+                                  FMA(prim.Subscript(prim.Variable(terms[i][1]), (prim.Variable(jac_iname),)),
+                                      prim.Subscript(pymbolic_pos, (l,)),
+                                      prim.Subscript(prim.Variable(terms[i][2]), (prim.Variable(jac_iname),)))))
         raise NotImplementedError()
     for i, expr in enumerate(expr_jac):
                     assignee=prim.Subscript(prim.Variable(name), (prim.Variable(jac_iname), i)),
-                    within_inames=frozenset((jac_iname, ) + sub_element_inames() + visitor.quadrature_inames()),
+                    within_inames=frozenset((jac_iname, ) + visitor.quadrature_inames()),
                     depends_on=frozenset({Writes(get_pymbolic_basename(pymbolic_pos))} | {Writes(cd) for cd in coefficients})
@@ -213,121 +195,6 @@ def name_jacobian_matrix(visitor):
     return name
-def compute_determinant(name, matrix, visitor):
-    dim = world_dimension()
-    matrix_entry = [[prim.Subscript(prim.Variable(matrix), (i, j)) for j in range(dim)] for i in range(dim)]
-    if dim == 2:
-        expr_determinant = prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][1])),
-                                     -1 * prim.Product((matrix_entry[1][0], matrix_entry[0][1]))))
-    elif dim == 3:
-        expr_determinant = prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][1], matrix_entry[2][2])),
-                                     prim.Product((matrix_entry[0][1], matrix_entry[1][2], matrix_entry[2][0])),
-                                     prim.Product((matrix_entry[0][2], matrix_entry[1][0], matrix_entry[2][1])),
-                                     -1 * prim.Product((matrix_entry[0][2], matrix_entry[1][1], matrix_entry[2][0])),
-                                     -1 * prim.Product((matrix_entry[0][0], matrix_entry[1][2], matrix_entry[2][1])),
-                                     -1 * prim.Product((matrix_entry[0][1], matrix_entry[1][0], matrix_entry[2][2]))
-                                     ))
-    else:
-        raise NotImplementedError()
-    instruction(expression=expr_determinant,
-                assignee=prim.Variable(name),
-                within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames()),
-                depends_on=frozenset({Writes(matrix)})
-                )
-def define_jacobian_determinant(name, visitor):
-    temporary_variable(name, shape=(), managed=True)
-    jacobian = name_jacobian_matrix(visitor)
-    compute_determinant(name, jacobian, visitor)
-def define_jacobian_determinant_inverse(name, visitor):
-    temporary_variable(name, shape=(), managed=True)
-    determinant = name_jacobian_determinant_signed(visitor)
-    return instruction(expression=prim.Quotient(1., prim.Variable(determinant)),
-                       assignee=prim.Variable(name),
-                       within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames()),
-                       depends_on=frozenset({Writes(determinant)})
-                       )
-def name_jacobian_determinant_signed(visitor):
-    name = "detjac_signed"
-    define_jacobian_determinant(name, visitor)
-    return name
-def name_jacobian_determinant_inverse(visitor):
-    name = "detjac_inverse"
-    define_jacobian_determinant_inverse(name, visitor)
-    return name
-def compute_inverse_transposed(name, det_inv, matrix, visitor):
-    dim = world_dimension()
-    matrix_entry = [[prim.Subscript(prim.Variable(matrix), (i, j)) for j in range(dim)] for i in range(dim)]
-    assignee = [[prim.Subscript(prim.Variable(name), (i, j)) for j in range(dim)] for i in range(dim)]
-    exprs = [[None for _ in range(dim)] for _ in range(dim)]
-    if dim == 2:
-        for i in range(2):
-            for j in range(2):
-                sign = 1. if i == j else -1.
-                exprs[i][j] = prim.Product((sign, prim.Variable(det_inv), matrix_entry[1 - i][1 - j]))
-    elif dim == 3:
-        exprs[0][0] = prim.Product((1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[1][1], matrix_entry[2][2])),
-                                              -1 * prim.Product((matrix_entry[1][2], matrix_entry[2][1]))))))
-        exprs[1][0] = prim.Product((-1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[0][1], matrix_entry[2][2])),
-                                              -1 * prim.Product((matrix_entry[0][2], matrix_entry[2][1]))))))
-        exprs[2][0] = prim.Product((1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[0][1], matrix_entry[1][2])),
-                                              -1 * prim.Product((matrix_entry[0][2], matrix_entry[1][1]))))))
-        exprs[0][1] = prim.Product((-1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[1][0], matrix_entry[2][2])),
-                                              -1 * prim.Product((matrix_entry[1][2], matrix_entry[2][0]))))))
-        exprs[1][1] = prim.Product((1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[2][2])),
-                                              -1 * prim.Product((matrix_entry[0][2], matrix_entry[2][0]))))))
-        exprs[2][1] = prim.Product((-1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][2])),
-                                              -1 * prim.Product((matrix_entry[0][2], matrix_entry[1][0]))))))
-        exprs[0][2] = prim.Product((1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[1][0], matrix_entry[2][1])),
-                                              -1 * prim.Product((matrix_entry[1][1], matrix_entry[2][0]))))))
-        exprs[1][2] = prim.Product((-1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[2][1])),
-                                              -1 * prim.Product((matrix_entry[0][1], matrix_entry[2][0]))))))
-        exprs[2][2] = prim.Product((1., prim.Variable(det_inv),
-                                    prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][1])),
-                                              -1 * prim.Product((matrix_entry[0][1], matrix_entry[1][0]))))))
-    else:
-        raise NotImplementedError
-    for j in range(dim):
-        for i in range(dim):
-            instruction(expression=exprs[i][j],
-                        assignee=assignee[i][j],
-                        within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames()),
-                        depends_on=frozenset({Writes(matrix), Writes(det_inv)}))
-def define_jacobian_inverse_transposed(name, visitor):
-    temporary_variable(name, shape=(world_dimension(), world_dimension()), managed=True)
-    jacobian = name_jacobian_matrix(visitor)
-    det_inv = name_jacobian_determinant_inverse(visitor)
-    compute_inverse_transposed(name, det_inv, jacobian, visitor)
-def name_jacobian_inverse_transposed(restriction, visitor):
-    name = restricted_name("jit", restriction)
-    define_jacobian_inverse_transposed(name, visitor)
-    return name
 def compute_multilinear_to_global_transformation(name, local, visitor):
     dim = world_dimension()
     temporary_variable(name, shape=(dim,), managed=True)
@@ -347,19 +214,22 @@ def compute_multilinear_to_global_transformation(name, local, visitor):
     # global[d] = T(local)[d]
     if dim == 2:
         a_pym, b_pym, c_pym = coeffs_pym
-        expr = a_pym * local_pym[0] * local_pym[1] + b_pym * local_pym[0] + c_pym * local_pym[1] + corner_0_pym
+        expr = FMA(a_pym, local_pym[0] * local_pym[1], FMA(b_pym, local_pym[0], FMA(c_pym, local_pym[1], corner_0_pym)))
     elif dim == 3:
         a_pym, b_pym, c_pym, d_pym, e_pym, f_pym, g_pym = coeffs_pym
-        expr = (a_pym * local_pym[0] * local_pym[1] * local_pym[2] + b_pym * local_pym[0] * local_pym[1] +
-                c_pym * local_pym[0] * local_pym[2] + d_pym * local_pym[1] * local_pym[2] +
-                e_pym * local_pym[0] + f_pym * local_pym[1] + g_pym * local_pym[2] + corner_0_pym)
+        expr = FMA(a_pym * local_pym[0], local_pym[1] * local_pym[2],
+                   FMA(b_pym, local_pym[0] * local_pym[1],
+                       FMA(c_pym, local_pym[0] * local_pym[2],
+                           FMA(d_pym, local_pym[1] * local_pym[2],
+                               FMA(e_pym, local_pym[0],
+                                   FMA(f_pym, local_pym[1], FMA(g_pym, local_pym[2], corner_0_pym)))))))
         raise NotImplementedError
     assignee = prim.Subscript(prim.Variable(name), (dim_pym,))
     instruction(assignee=assignee, expression=expr,
-                within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames() + (dim_pym.name,)),
+                within_inames=frozenset(visitor.quadrature_inames() + (dim_pym.name,)),
                 depends_on=frozenset({Writes(get_pymbolic_basename(local)), Writes(corners)})
@@ -376,13 +246,14 @@ def compute_axiparallel_to_global_transformation(name, local, visitor):
     dim_pym = prim.Variable(component_iname('to_global'))
     # global[d] = lower_left[d] + local[d] * (upper_right[d] - lower_left[d])
-    expr = (prim.Subscript(prim.Variable(corners), (0, dim_pym)) +
-            prim.Subscript(local, (dim_pym,)) * (prim.Subscript(prim.Variable(corners), (2**dim - 1, dim_pym)) -
-                                                 prim.Subscript(prim.Variable(corners), (0, dim_pym))))
+    expr = FMA(prim.Subscript(prim.Variable(corners), (2**dim - 1, dim_pym)) -
+               prim.Subscript(prim.Variable(corners), (0, dim_pym)),
+               prim.Subscript(local, (dim_pym,)), prim.Subscript(prim.Variable(corners), (0, dim_pym)))
     assignee = prim.Subscript(prim.Variable(name), (dim_pym,))
     instruction(assignee=assignee, expression=expr,
-                within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames() + (dim_pym.name,)),
+                within_inames=frozenset(visitor.quadrature_inames() + (dim_pym.name,)),
                 depends_on=frozenset({Writes(get_pymbolic_basename(local)), Writes(corners)})
diff --git a/python/dune/codegen/blockstructured/quadrature.py b/python/dune/codegen/blockstructured/quadrature.py
index d3ba4950e8034232ebd41258f571b887caf198a4..5028bd35646fc82d9e529ba8305e67cf5ee7add8 100644
--- a/python/dune/codegen/blockstructured/quadrature.py
+++ b/python/dune/codegen/blockstructured/quadrature.py
@@ -1,14 +1,18 @@
 from dune.codegen.error import CodegenError
 from dune.codegen.generation import quadrature_mixin
 from dune.codegen.pdelab.quadrature import GenericQuadratureMixin
-from dune.codegen.blockstructured.tools import name_point_in_macro
+from dune.codegen.blockstructured.tools import name_point_in_macro, sub_element_inames, remove_sub_element_inames
 import pymbolic.primitives as prim
 class BlockstructuredQuadratureMixin(GenericQuadratureMixin):
+    @staticmethod
+    def _subscript_without_sub_element_inames(s):
+        """Rebuild the subscript ``s`` with a filtered index tuple.
+
+        The aggregate of ``s`` is kept; its ``index_tuple`` is passed through
+        ``remove_sub_element_inames`` before the new ``prim.Subscript`` is built.
+        """
+        aggregate = s.aggregate
+        filtered_indices = remove_sub_element_inames(s.index_tuple)
+        return prim.Subscript(aggregate, filtered_indices)
     def quadrature_position_in_micro(self, index=None):
-        return GenericQuadratureMixin.quadrature_position(self, index)
+        return self._subscript_without_sub_element_inames(super().quadrature_position(index))
     def quadrature_position_in_macro(self, index=None):
         original = self.quadrature_position_in_micro(index)
@@ -20,3 +24,9 @@ class BlockstructuredQuadratureMixin(GenericQuadratureMixin):
     def quadrature_position(self, index=None):
         raise CodegenError('Decide if the quadrature point is in the macro or micro element')
+    def quadrature_inames(self):
+        """Return the parent class' quadrature inames followed by the sub-element inames."""
+        inherited = super().quadrature_inames()
+        return inherited + sub_element_inames()
+    def quadrature_weight(self, o):
+        """Return the parent class' quadrature weight for ``o``, rebuilt by
+        ``_subscript_without_sub_element_inames``."""
+        weight = super().quadrature_weight(o)
+        return self._subscript_without_sub_element_inames(weight)
diff --git a/python/dune/codegen/blockstructured/tools.py b/python/dune/codegen/blockstructured/tools.py
index 802b819f23754aa0862e122f4127a3db18593117..c929aa1e447960ce831af54c188bdcf645cc05a7 100644
--- a/python/dune/codegen/blockstructured/tools.py
+++ b/python/dune/codegen/blockstructured/tools.py
@@ -2,7 +2,7 @@ from dune.codegen.tools import get_pymbolic_basename
 from dune.codegen.generation import (iname,
-                                     instruction)
+                                     instruction, globalarg, preamble)
 from dune.codegen.pdelab.geometry import world_dimension
 from dune.codegen.options import get_form_option
 import pymbolic.primitives as prim
@@ -22,6 +22,11 @@ def sub_element_inames():
     return inames
+def remove_sub_element_inames(indices):
+    """Return ``indices`` without any sub-element iname.
+
+    Both the iname strings returned by ``sub_element_inames()`` and their
+    ``prim.Variable`` wrappers are dropped. The remaining entries keep their
+    original relative order (and multiplicity), which matters because the
+    result is used as the index tuple of a subscript.
+
+    :param indices: tuple of (hashable) subscript indices
+    :returns: tuple of the entries of ``indices`` that are not sub-element inames
+    """
+    assert isinstance(indices, tuple)
+    inames = sub_element_inames()
+    dropped = set(inames) | set(prim.Variable(i) for i in inames)
+    # Filter instead of using a set difference: sets are unordered, so a
+    # difference would scramble the positions of the surviving indices.
+    return tuple(i for i in indices if i not in dropped)
 # compute sequential index for given tensor index, the same as index in base-k to base-10
 def tensor_index_to_sequential_index(indices, k):
     return prim.Sum(tuple(prim.Variable(index) * k ** i for i, index in enumerate(indices)))
@@ -64,7 +69,7 @@ def define_point_in_macro(name, point_in_micro, visitor):
         # TODO relax within inames
         instruction(assignee=prim.Subscript(prim.Variable(name), (i,)),
-                    within_inames=frozenset(subelem_inames + visitor.quadrature_inames()),
+                    within_inames=frozenset(visitor.quadrature_inames()),
@@ -74,3 +79,33 @@ def name_point_in_macro(point_in_micro, visitor):
     name = get_pymbolic_basename(point_in_micro) + "_macro"
     define_point_in_macro(name, point_in_micro, visitor)
     return name
+def define_container_alias(name, container, lfs, element, is_const):
+    """Register the global argument ``name`` and return the C++ statement defining it.
+
+    Shape and strides of the argument are derived from the form option
+    ``number_of_blocks``, the degree of ``element`` and the world dimension.
+    The returned statement takes the address of the entry ``(lfs.name, 0)`` of
+    ``container`` (const variant) or of ``container.container()`` (mutable variant).
+
+    NOTE(review): the callers visible next to this function discard the returned
+    string; presumably a decorator (``preamble`` is imported in this module)
+    consumes it -- confirm.
+    """
+    blocks = get_form_option("number_of_blocks")
+    degree = element.degree()
+    dim = world_dimension()
+    # common base of both stride families
+    base = degree * blocks + 1
+    element_stride = tuple(degree * base**i for i in range(dim))
+    index_stride = tuple(base**i for i in range(dim))
+    globalarg(name,
+              shape=(blocks,) * dim + (degree + 1,) * dim,
+              strides=element_stride + index_stride,
+              managed=True)
+    if not is_const:
+        return "auto {} = &{}.container()({},0);".format(name, container, lfs.name)
+    return "const auto {} = &{}({},0);".format(name, container, lfs.name)
+def name_accumulation_alias(container, accumspace):
+    """Define the mutable alias and its ``_tail`` twin for ``container`` and return the alias name.
+
+    Both aliases are defined through ``define_container_alias`` with
+    ``is_const=False``; only the name without the ``_tail`` suffix is returned.
+    """
+    lfs = accumspace.lfs
+    element = accumspace.element
+    name = container + "_" + lfs.name + "_alias"
+    name_tail = name + "_tail"
+    for alias in (name, name_tail):
+        define_container_alias(alias, container, lfs, element, is_const=False)
+    return name
+def name_container_alias(container, lfs, element):
+    """Define the const alias for ``container`` restricted to ``lfs`` and return its name."""
+    alias = container + "_" + lfs.name + "_alias"
+    define_container_alias(alias, container, lfs, element, is_const=True)
+    return alias
diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index d5224e6ffdd19ae88ccbdc6a798a8b416a6fd36d..94bc4875f262c72d695007be04c0b4107208f2f5 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -26,20 +26,22 @@ def add_vcl_temporaries(knl, vcl_size):
     init_iname = 'init_vec{}'.format(vcl_size)
     from islpy import BasicSet
     init_domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(init_iname, get_vcl_type_size(dtype_floatingpoint())))
+    silenced_warnings = []
     for alias in vector_alias:
         vector_name = alias.replace('alias', 'vec{}'.format(vcl_size))
         new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64,
                                                                  shape=(vcl_size,), managed=True,
                                                                  scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
-        # write once to the vector such that loopy won't complain
-        new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(init_iname)),
-                                       expression=0, within_inames=frozenset({init_iname}),
-                                       id='init_{}'.format(vector_name)))
+        # silence warning such that loopy won't complain
+        silenced_warnings.append("read_no_write({})".format(vector_name))
     from loopy.kernel.data import VectorizeTag
     return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [init_domain],
                     temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries),
-                    iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}))
+                    iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}),
+                    silenced_warnings=knl.silenced_warnings + silenced_warnings)
 def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level):
@@ -248,6 +250,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
         load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
                                              id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),
                                              depends_on=insn.depends_on | write_ids,
+                                             depends_on_is_final=True,
         read_dependencies.setdefault(id, set())
@@ -277,6 +280,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
                                               id=store_id, within_inames=insn.within_inames,
                                               depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] |
+                                              depends_on_is_final=True,
     # replace alias with vcl vector, except for accumulation assignee
@@ -341,10 +345,12 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
                                            depends_on=(insn.depends_on | read_dependencies[insn.id] |
+                                           depends_on_is_final=True,
                                            tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
                 new_insns.append(insn.copy(depends_on=(insn.depends_on | read_dependencies[insn.id] |
+                                           depends_on_is_final=True,
                                            tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
     return knl.copy(instructions=new_insns + load_insns + store_insns)
diff --git a/python/dune/codegen/error.py b/python/dune/codegen/error.py
index 04484f25a9874fe1c753b9a0adb40e6501dfe5ab..047e8becc306cfda369c787175a45503aabcdd97 100644
--- a/python/dune/codegen/error.py
+++ b/python/dune/codegen/error.py
@@ -23,3 +23,7 @@ class CodegenVectorizationError(CodegenCodegenError):
 class CodegenAutotuneError(CodegenVectorizationError):
+class CodegenUnsupportedFiniteElementError(CodegenUFLError):
+    """Specialisation of CodegenUFLError; going by its name, it signals a finite
+    element that the code generator does not support."""
diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py
index bed0256407b7259bab61b6e932c4a17761097e75..97090e18852359b10d1a2d3f74a268a3abac60f1 100644
--- a/python/dune/codegen/generation/__init__.py
+++ b/python/dune/codegen/generation/__init__.py
@@ -24,6 +24,7 @@ from dune.codegen.generation.cpp import (base_class,
+                                         dump_ssc_marks
 from dune.codegen.generation.hooks import (hook,
diff --git a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py
index b918291067f45c5f988bc8fdcea55651d538a9db..2ea4c346590ee80ef329fdc9394b9fbc3c59db9c 100644
--- a/python/dune/codegen/generation/cpp.py
+++ b/python/dune/codegen/generation/cpp.py
@@ -55,3 +55,10 @@ def dump_accumulate_timer(name):
 def register_liwkid_timer(name):
     return "LIKWID_MARKER_REGISTER(\"{}\");".format(name)
+def dump_ssc_marks(name):
+    """Return a C++ statement that prints ``name`` together with its region marks.
+
+    The marks are obtained from ``get_region_marks(name, driver=False)`` and
+    are streamed to ``std::cout`` separated by ``<-->``.
+    """
+    # Imported locally, as in the surrounding module.
+    from dune.codegen.pdelab.driver.timings import get_region_marks
+    marks = get_region_marks(name, driver=False)
+    template = 'std::cout << "{}: " << {} << " <--> " << {} << std::endl;'
+    return template.format(name, *marks)
diff --git a/python/dune/codegen/loopy/vcl.py b/python/dune/codegen/loopy/vcl.py
index e0943a69ac35136abfe59a7f641e0f076681643c..2431275a5f18a3bc87272711ab5ed71c038ced0b 100644
--- a/python/dune/codegen/loopy/vcl.py
+++ b/python/dune/codegen/loopy/vcl.py
@@ -119,9 +119,17 @@ def vcl_function_mangler(knl, func, arg_dtypes):
         return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))
     if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"):
+        if get_option("permuting_horizontal_add"):
+            func = "permuting_{}".format(func)
         dtype = arg_dtypes[0]
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
-        include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
+        if get_option("opcounter"):
+            include_file("dune/codegen/sumfact/oc_horizontaladd.hh", filetag="operatorfile")
+        else:
+            include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
         return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,))
     if isinstance(func, VCLPermute):
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index c8c02a47e982ed64c9a0504ec8204e40183a7587..724f80df6554a9d977bf21210fd044daf9644274 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -8,6 +8,7 @@ import yaml
 import pkg_resources
 from six.moves import configparser
 from six import StringIO
+from contextlib import contextmanager
 class CodegenOptionsValidator(cerberus.Validator):
@@ -44,6 +45,41 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
         ImmutableRecord.__init__(self, **opts)
+    # Arguments that are to be set from the outside
+    uflfile = CodegenOption(helpstr="the UFL file to compile")
+    debug_cache_with_stack = CodegenOption(default=False, helpstr="Store stack along with cache objects. Makes debugging caching issues easier.")
+    driver_file = CodegenOption(helpstr="The filename for the generated driver header")
+    explicit_time_stepping = CodegenOption(default=False, helpstr="use explicit time stepping")
+    time_stepping_order = CodegenOption(default=1, helpstr="Order of the time stepping method")
+    exact_solution_expression = CodegenOption(helpstr="name of the exact solution expression in the ufl file")
+    compare_l2errorsquared = CodegenOption(helpstr="maximal allowed l2 error squared of difference between numerical solution and interpolation of exact solution (NOTE: requires --exact-solution-expression)")
+    grid_info = CodegenOption(default=None, helpstr="Path to file with information about facedir and facemod variations. This can be used to limit the generation of skeleton kernels.")
+    l2error_tree_path = CodegenOption(default=None, helpstr="Tree pathes that should be considered for l2 error calculation. Default None means we take all of them into account.")
+    ini_file = CodegenOption(helpstr="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)")
+    opcounter = CodegenOption(default=False, helpstr="Count operations. Note: In this case only operator applications are generated since solving and operator counting does not work. You probably want to set instrumentation level>0.")
+    performance_measuring = CodegenOption(default=False, helpstr="Generate opcounter codepath, but only measure times!")
+    instrumentation_level = CodegenOption(default=0, helpstr="Control time/opcounter measurements. 0-do nothing, 1-measure program as a whole, 2-operator applications, 3-measure kernel (eg. alpha-volume, ...), 4-parts of kernel (eg. stage 1-3 of SF)")
+    project_basedir = CodegenOption(helpstr="The base (build) directory of the dune-codegen project")
+    architecture = CodegenOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl|skylake")
+    yaspgrid_offset = CodegenOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
+    grid_unstructured = CodegenOption(default=False, helpstr="Set to true if you want to use an unstructured grid.")
+    grid_consistent = CodegenOption(default=False, helpstr="The used grid is already consistent.")
+    precision_bits = CodegenOption(default=64, helpstr="The number of bits for the floating point type")
+    overlapping = CodegenOption(default=False, helpstr="Use an overlapping solver and constraints. You still need to make sure to construct a grid with overlap! The parallel option will be set automatically.")
+    operators = CodegenOption(default="r", helpstr="A comma separated list of operators, each name will be interpreted as a subsection name within the formcompiler section")
+    target_name = CodegenOption(default=None, helpstr="The target name from CMake")
+    operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
+    debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
+    use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.")
+    use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.")
+    autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
+    with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi")
+    permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.")
+    # Arguments that are mainly to be set by logic depending on other options
+    max_vector_width = CodegenOption(default=256, helpstr=None)
+    parallel = CodegenOption(default=False, helpstr="Mark that this program should be run in parallel. If set to true the c++ code will check that there are more than 1 MPI-ranks involved and the error computation will use communication.")
 class CodegenFormOptionsArray(ImmutableRecord):
     """ A collection of form-specific form compiler arguments """
@@ -54,12 +90,80 @@ class CodegenFormOptionsArray(ImmutableRecord):
         ImmutableRecord.__init__(self, **opts)
+    # Form specific options
+    form = CodegenOption(default=None, helpstr="The name of the UFL object representing the form in the UFL file")
+    filename = CodegenOption(default=None, helpstr="The filename to use for this LocalOperator")
+    classname = CodegenOption(default=None, helpstr="The name of the C++ class to generate")
+    numerical_jacobian = CodegenOption(default=False, helpstr="use numerical jacobians (only makes sense, if uflpdelab for some reason fails to generate analytic jacobians)")
+    matrix_free = CodegenOption(default=False, helpstr="Generate jacobian_apply_* methods for matrix free solvers")
+    print_transformations = CodegenOption(default=False, helpstr="print out dot files after ufl tree transformations")
+    print_transformations_dir = CodegenOption(default=".", helpstr="place where to put dot files (can be omitted)")
+    quadrature_order = CodegenOption(_type=int, helpstr="Quadrature order used for all integrals.")
+    diagonal_transformation_matrix = CodegenOption(default=False, helpstr="set option if the jacobian of the transformation is diagonal (axiparallel grids)")
+    constant_transformation_matrix = CodegenOption(default=False, helpstr="set option if the jacobian of the transformation is constant on a cell")
+    fastdg = CodegenOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
+    sumfact = CodegenOption(default=False, helpstr="Use sumfactorization")
+    sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
+    sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
+    sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in sumf factorization function using autotuning.")
+    sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.")
+    sumfact_performance_transformations_testrun = CodegenOption(default=0, helpstr="If larger than zero determines test case to run.")
+    vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
+    vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
+    vectorization_horizontal = CodegenOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
+    vectorization_vertical = CodegenOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
+    vectorization_padding = CodegenOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
+    vectorization_allow_quadrature_changes = CodegenOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
+    vectorization_list_index = CodegenOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).")
+    vectorization_jacobians = CodegenOption(default=True, helpstr="Whether to attempt to vectorize jacobians (takes time, often not needed)")
+    vectorization_target = CodegenOption(_type=float, helpstr="The cost function target for the 'target' cost model. Only needed to verify the cost model itself, do not use light-heartedly!!!")
+    simplify = CodegenOption(default=False, helpstr="Whether to simplify expressions using sympy")
+    generate_jacobians = CodegenOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.")
+    generate_jacobian_apply = CodegenOption(default=False, helpstr="Whether jacobian_apply_* methods should be generated.")
+    generate_residuals = CodegenOption(default=True, helpstr="Whether alpha_* methods should be generated.")
+    unroll_dimension_loops = CodegenOption(default=False, helpstr="whether loops over the geometric dimension should be unrolled")
+    blockstructured = CodegenOption(default=False, helpstr="Use block structure")
+    number_of_blocks = CodegenOption(default=1, helpstr="Number of sub blocks in one direction")
+    vectorization_blockstructured = CodegenOption(default=False, helpstr="Vectorize block structuring")
+    vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'number_of_blocks' is not divisible by vector length")
+    vectorization_blockstructured_tail_ordering = CodegenOption(default='consecutive', helpstr="Ordering of the tail w.r.t the vectorized loop. Possible values: consecutive|blocked")
+    adjoint = CodegenOption(default=False, helpstr="Generate adjoint operator")
+    control = CodegenOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable")
+    objective_function = CodegenOption(default=None, helpstr="Name of form representing the objective function in UFL file")
+    control_variable = CodegenOption(default=None, helpstr="Name of control variable in UFL file")
+    block_preconditioner_diagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the diagonal part of a block preconditioner")
+    block_preconditioner_offdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the off-diagonal part of a block preconditioner")
+    block_preconditioner_pointdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the point diagonal part of a block preconditioner")
+    geometry_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for geometries. Currently implemented mixins: generic, axiparallel, equidistant, sumfact_multilinear, sumfact_axiparallel, sumfact_equidistant")
+    quadrature_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for quadrature. Currently implemented: generic, sumfact")
+    basis_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for basis function evaluation. Currently implemented: generic, sumfact")
+    accumulation_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for accumulation. Currently implemented: generic, sumfact, control, blockstructured")
+    enable_volume = CodegenOption(default=True, helpstr="Whether to assemble volume integrals")
+    enable_skeleton = CodegenOption(default=True, helpstr="Whether to assemble skeleton integrals")
+    enable_boundary = CodegenOption(default=True, helpstr="Whether to assemble boundary integrals")
 # Until more sophisticated logic is needed, we keep the actual option data in this module
 _global_options = CodegenGlobalOptionsArray()
 _form_options = {}
+def show_options():
+    """Print a summary of the available code generation options.
+
+    For both option classes, every class attribute that is a ``CodegenOption``
+    with a help string is printed as its name followed by the indented help
+    text. Options whose ``helpstr`` is ``None`` are omitted.
+    """
+    # TODO: This needs to be adjusted to options-validation
+    def subopt(arr):
+        # arr is the option class itself; its options live in the class __dict__
+        for k, v in arr.__dict__.items():
+            if isinstance(v, CodegenOption) and v.helpstr is not None:
+                print("{}\n    {}".format(k, v.helpstr))
+    print("This is a summary of options available for the code generation process:\n")
+    print("The following options can be given in the [formcompiler] section:")
+    subopt(CodegenGlobalOptionsArray)
+    print("\nThe following options can be given in a form-specific subsection of [formcompiler]:")
+    subopt(CodegenFormOptionsArray)
 def initialize_options():
     """ Initialize the options from the command line """
     global _global_options
@@ -192,10 +296,16 @@ def process_form_options(opt, form):
     if opt.filename is None:
         opt = opt.copy(filename="{}_{}_file.hh".format(get_option("target_name"), opt.classname))
+    if opt.block_preconditioner_pointdiagonal:
+        opt = opt.copy(generate_jacobians=False,
+                       basis_mixins="sumfact_pointdiagonal",
+                       accumulation_mixins="sumfact_pointdiagonal",
+                       )
     if opt.block_preconditioner_diagonal or opt.block_preconditioner_offdiagonal:
         assert opt.numerical_jacobian is False
         opt = opt.copy(generate_residuals=False,
-                       generate_jacobians=False,
+                       generate_jacobians=True,
@@ -251,21 +361,39 @@ def get_form_option(key, form=None):
     return getattr(processed_form_opts, key)
-def option_switch(opt):
-    def _switch():
-        if isinstance(opt, tuple):
-            opts = opt
-        else:
-            assert isinstance(opt, str)
-            opts = (opt,)
-        try:
-            for o in opts:
-                if get_option(o):
-                    return o
-            return "default"
-        except AttributeError:
-            for o in opts:
-                if get_form_option(o):
-                    return o
-            return "default"
-    return _switch
+def option_context(conditional=True, **opts):
+    """ A context manager that sets a given option and restores it on exit.
+    Arguments:
+    conditional: If False, no option is touched and the managed block runs unchanged.
+    opts: The option names and the values they are temporarily set to.
+    The previous values are restored on exit, also if the managed block raises.
+    """
+    # NOTE(review): this generator is only usable in a with statement if it is wrapped
+    # by contextlib.contextmanager - the decorator is not visible in this hunk, confirm.
+    backup = {}
+    if conditional:
+        # Backup old values and set to new ones
+        for k, v in opts.items():
+            backup[k] = get_option(k)
+            set_option(k, v)
+    try:
+        yield
+    finally:
+        # Restore old values (backup is empty if conditional is False)
+        for k, v in backup.items():
+            set_option(k, v)
+def form_option_context(conditional=True, **opts):
+    """ A context manager that sets a given form option and restores it on exit.
+    Arguments:
+    conditional: If False, no option is touched and the managed block runs unchanged.
+    opts: The form option names and the values they are temporarily set to. The
+        special key "form" is not an option to set, but is passed on as the form
+        argument of get_form_option/set_form_option.
+    The previous values are restored on exit, also if the managed block raises.
+    """
+    # NOTE(review): this generator is only usable in a with statement if it is wrapped
+    # by contextlib.contextmanager - the decorator is not visible in this hunk, confirm.
+    backup = {}
+    form = None
+    if conditional:
+        form = opts.pop("form", None)
+        # Backup old values and set to new ones
+        for k, v in opts.items():
+            backup[k] = get_form_option(k, form=form)
+            set_form_option(k, v, form=form)
+    try:
+        yield
+    finally:
+        # Restore old values (backup is empty if conditional is False)
+        for k, v in backup.items():
+            set_form_option(k, v, form=form)
diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py
index c3cc48298c1b2efb70c6c7716b29b221486044e1..dc1acd660c137c42be6fb65bb687bafe03fbc730 100644
--- a/python/dune/codegen/pdelab/argument.py
+++ b/python/dune/codegen/pdelab/argument.py
@@ -19,6 +19,7 @@ from dune.codegen.pdelab.spaces import (lfs_iname,
 from dune.codegen.pdelab.restriction import restricted_name
 from dune.codegen.ufl.modified_terminals import Restriction
+from dune.codegen.options import get_form_option
 from pymbolic.primitives import Call, Subscript, Variable
@@ -116,6 +117,10 @@ def type_coefficientcontainer():
     return "X"
+def type_linearizationpointcontainer():
+    """ Return "Z", the identifier used for the type of the linearization point container """
+    container_type = "Z"
+    return container_type
 def name_jacobian(restriction1, restriction2):
     # Restrictions may only differ if NONE
     if (restriction1 == Restriction.NONE) or (restriction2 == Restriction.NONE):
@@ -144,6 +149,8 @@ def name_accumulation_variable(restrictions=None):
                 restrictions = (Restriction.NONE,)
                 restrictions = (Restriction.POSITIVE,)
+        if get_form_option("block_preconditioner_pointdiagonal"):
+            restrictions = restrictions[:1]
         return name_residual(*restrictions)
     if ft == 'jacobian':
         if restrictions is None:
diff --git a/python/dune/codegen/pdelab/basis.py b/python/dune/codegen/pdelab/basis.py
index 03c069ac518f01efaec8304356fddcf34915668f..885194af5817bd30d6cc339603ab84f00bc7b990 100644
--- a/python/dune/codegen/pdelab/basis.py
+++ b/python/dune/codegen/pdelab/basis.py
@@ -8,9 +8,7 @@ from dune.codegen.generation import (basis_mixin,
-from dune.codegen.options import (option_switch,
-                                  get_form_option,
-                                  )
+from dune.codegen.options import get_form_option
 from dune.codegen.pdelab.argument import (name_applycontainer,
@@ -39,6 +37,7 @@ from dune.codegen.pdelab.restriction import restricted_name
 from dune.codegen.pdelab.driver import (isPk,
+from dune.codegen.ufl.modified_terminals import Restriction
 from pymbolic.primitives import Product, Subscript, Variable
 import pymbolic.primitives as prim
@@ -81,7 +80,11 @@ class BasisMixinBase(object):
 class GenericBasisMixin(BasisMixinBase):
     def initialize_function_spaces(self, expr):
-        return initialize_function_spaces(expr, self)
+        # Delegate to the free function of the same name, passing the restriction
+        # and indices of this visitor explicitly instead of the visitor itself.
+        restriction = self.restriction
+        # On exterior facets the positive restriction is always used
+        if self.measure == 'exterior_facet':
+            restriction = Restriction.POSITIVE
+        return initialize_function_spaces(expr, restriction, self.indices)
     def lfs_inames(self, element, restriction, number, context=""):
         return (lfs_iname(element, restriction, number, context),)
diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py
index b60544c1c78242f1490c76c46d4be6a2c4448501..12b5af6d9da9344b99064ee6fc071c52ac35dcd2 100644
--- a/python/dune/codegen/pdelab/driver/__init__.py
+++ b/python/dune/codegen/pdelab/driver/__init__.py
@@ -22,6 +22,8 @@ from dune.codegen.generation import (generator_factory,
 from dune.codegen.options import (get_form_option,
+from ufl import TensorProductCell
 # The following functions are not doing anything useful, but providing easy access
@@ -39,7 +41,10 @@ def get_form_ident():
 def get_form():
     data = get_global_context_value("data")
-    return data.object_by_name[get_form_option("form", get_form_ident())]
+    # The "form" option names the object to look up. If it is not set (None),
+    # the value of get_form_ident() is used as the name instead.
+    form = get_form_option("form")
+    if form is None:
+        form = get_form_ident()
+    return data.object_by_name[form]
 def get_dimension():
@@ -81,6 +86,9 @@ def isLagrange(fem):
 def isSimplical(cell):
+    if isinstance(cell, TensorProductCell):
+        return False
     # Cells can be identified through strings *or* ufl objects
     if not isinstance(cell, str):
         cell = cell.cellname()
@@ -88,6 +96,9 @@ def isSimplical(cell):
 def isQuadrilateral(cell):
+    if isinstance(cell, TensorProductCell):
+        return all(tuple(isSimplical(c) for c in cell.sub_cells()))
     # Cells can be identified through strings *or* ufl objects
     if not isinstance(cell, str):
         cell = cell.cellname()
@@ -120,20 +131,10 @@ def FEM_name_mangling(fem):
             name = name + FEM_name_mangling(elem)
         return name
     if isinstance(fem, FiniteElement):
-        if isPk(fem):
-            return "P" + str(fem._degree)
-        if isQk(fem):
-            return "Q" + str(fem._degree)
-        if isDG(fem):
-            return "DG" + str(fem._degree)
+        return "{}{}".format(fem._short_name, fem.degree())
     if isinstance(fem, TensorProductElement):
         assert(len(set(subel._short_name for subel in fem.sub_elements())) == 1)
-        if isLagrange(fem.sub_elements()[0]):
-            return "TensorQ" + '_'.join(map(str, fem._degree))
-        if isDG(fem.sub_elements()[0]):
-            return "TensorDG" + '_'.join(map(str, fem._degree))
-        raise NotImplementedError("fem name mangling")
+        return "TP_{}".format("_".join(FEM_name_mangling(subel) for subel in fem.sub_elements()))
     raise NotImplementedError("FEM NAME MANGLING")
@@ -212,7 +213,10 @@ def name_initree():
 def define_mpihelper(name):
     include_file("dune/common/parallel/mpihelper.hh", filetag="driver")
-    return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    # Depending on the "with_mpi" option, the generated driver obtains either the
+    # Dune::MPIHelper or the Dune::FakeMPIHelper singleton under the given name.
+    # NOTE(review): get_option needs to be imported in this module - the import
+    # is not visible in this hunk, confirm.
+    if get_option("with_mpi"):
+        return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    else:
+        return "Dune::FakeMPIHelper& {} = Dune::FakeMPIHelper::instance(argc, argv);".format(name)
 def name_mpihelper():
@@ -282,6 +286,13 @@ def generate_driver():
     contents = []
+    # Assert that this program was called with ini file
+    contents += ['if (argc != 2){',
+                 '  std::cerr << "This program needs to be called with an ini file" << std::endl;',
+                 '  return 1;',
+                 '}',
+                 '']
     def add_section(tag, comment):
         tagcontents = [i for i in retrieve_cache_items("preamble and {}".format(tag), make_generable=True)]
         if tagcontents:
diff --git a/python/dune/codegen/pdelab/driver/gridfunctionspace.py b/python/dune/codegen/pdelab/driver/gridfunctionspace.py
index 377b304b03499aa3c109390cf72f4af0f6327e9b..49789b794d1cad5e84b69da898fc13c7e823a9fd 100644
--- a/python/dune/codegen/pdelab/driver/gridfunctionspace.py
+++ b/python/dune/codegen/pdelab/driver/gridfunctionspace.py
@@ -1,3 +1,4 @@
+from dune.codegen.error import CodegenUnsupportedFiniteElementError
 from dune.codegen.generation import (include_file,
@@ -9,10 +10,6 @@ from dune.codegen.pdelab.driver import (FEM_name_mangling,
-                                        isLagrange,
-                                        isDG,
-                                        isPk,
-                                        isQk,
@@ -20,7 +17,7 @@ from dune.codegen.pdelab.driver import (FEM_name_mangling,
 from dune.codegen.loopy.target import type_floatingpoint
-from ufl import FiniteElement, MixedElement, TensorElement, VectorElement, TensorProductElement
+from ufl import FiniteElement, MixedElement, TensorElement, VectorElement, TensorProductElement, TensorProductCell
@@ -82,7 +79,8 @@ def define_grid(name):
     _type = type_grid()
     # TODO: In principle this is only necessary if we use sum factorization in
     # one of the operators. So this could be turned off if that is not the case.
-    if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured"):
+    if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured") and not \
+            get_option("grid_consistent"):
         include_file("dune/consistent-edge-orientation/createconsistentgrid.hh", filetag="driver")
         return ["IniGridFactory<{}> factory({});".format(_type, ini),
                 "std::shared_ptr<{}> grid_nonconsistent = factory.getGrid();".format(_type),
@@ -122,61 +120,98 @@ def name_leafview():
     return name
+def get_short_name(element):
+    """ Return the short name of a finite element. For a TensorProductElement,
+    all sub elements are asserted to share one short name, which is returned.
+    """
+    if not isinstance(element, TensorProductElement):
+        return element._short_name
+    # All factors of the tensor product need to agree on their short name
+    names = set(subel._short_name for subel in element.sub_elements())
+    assert len(names) == 1
+    return get_short_name(element.sub_elements()[0])
 def typedef_fem(element, name):
+    """ Generate the C++ using-declaration of the finite element map type for element.
+    Arguments:
+    element: The UFL finite element to select an implementation for
+    name: The alias name introduced by the generated using-declaration
+    Raises CodegenUnsupportedFiniteElementError if the element/cell/dimension/degree
+    combination is not supported and NotImplementedError for known element
+    families without an implementation.
+    """
     gv = type_leafview()
     df = type_domainfield()
     r = type_range()
     dim = get_dimension()
+    cell = element.cell()
+    degree = element.degree()
+    # We currently only support TensorProductElement from UFL if it aliases another finite element
+    # available from UFL. Here, we check this condition and recover the aliased element
+    if isinstance(element, TensorProductElement):
+        subels = set(subel._short_name for subel in element.sub_elements())
+        if len(subels) != 1 or len(set(element.degree())) != 1:
+            raise CodegenUnsupportedFiniteElementError(element)
+        degree = element.degree()[0]
+        cell = TensorProductCell(*tuple(subel.cell() for subel in element.sub_elements()))
+    # The short name is determined only after the validation above: get_short_name asserts
+    # that all sub elements agree, which would otherwise pre-empt the dedicated error
+    short = get_short_name(element)
+    # The blockstructured code branch has its own handling of finite element selection
     if get_form_option("blockstructured"):
         include_file("dune/codegen/blockstructured/blockstructuredqkfem.hh", filetag="driver")
-        degree = element.degree() * get_form_option("number_of_blocks")
+        degree = degree * get_form_option("number_of_blocks")
         return "using {} = Dune::PDELab::BlockstructuredQkLocalFiniteElementMap<{}, {}, {}, {}>;" \
             .format(name, gv, df, r, degree)
-    if isinstance(element, TensorProductElement):
-        # Only allow TensorProductElements where all subelements are
-        # of the same type ('CG' or 'DG')
-        assert(len(set(subel._short_name for subel in element.sub_elements())) == 1)
-        # Anisotropic degree is not yet supported in Dune
-        degrees = element.degree()
-        for deg in degrees:
-            assert (deg == degrees[0])
-        # TensorProductElements have Qk structure -> no Pk
-        if isLagrange(element.sub_elements()[0]):
-            include_file("dune/pdelab/finiteelementmap/qkfem.hh", filetag="driver")
-            return "using {} = Dune::PDELab::QkLocalFiniteElementMap<{}, {}, {}, {}>;" \
-                .format(name, gv, df, r, degrees[0])
-        elif isDG(element.sub_elements()[0]):
-            include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="driver")
-            # TODO allow switching the basis here!
-            return "using {} = Dune::PDELab::QkDGLocalFiniteElementMap<{}, {}, {}, {}>;" \
-                .format(name, df, r, degrees[0], dim)
-        raise NotImplementedError("FEM not implemented in dune-codegen")
-    elif isQk(element):
-        include_file("dune/pdelab/finiteelementmap/qkfem.hh", filetag="driver")
-        return "using {} = Dune::PDELab::QkLocalFiniteElementMap<{}, {}, {}, {}>;" \
-            .format(name, gv, df, r, element.degree())
-    elif isPk(element):
-        include_file("dune/pdelab/finiteelementmap/pkfem.hh", filetag="driver")
-        return "using {} = Dune::PDELab::PkLocalFiniteElementMap<{}, {}, {}, {}>;" \
-            .format(name, gv, df, r, element.degree())
-    elif isDG(element):
-        if isQuadrilateral(element.cell()):
-            include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="driver")
-            # TODO allow switching the basis here!
-            return "using {} = Dune::PDELab::QkDGLocalFiniteElementMap<{}, {}, {}, {}>;" \
-                .format(name, df, r, element.degree(), dim)
-        if isSimplical(element.cell()):
-            include_file("dune/pdelab/finiteelementmap/opbfem.hh", filetag="driver")
-            return "using {} = Dune::PDELab::OPBLocalFiniteElementMap<{}, {}, {}, {}, Dune::GeometryType::simplex>;" \
-                .format(name, df, r, element.degree(), dim)
-        raise NotImplementedError("Geometry type not known in code generation")
-    raise NotImplementedError("FEM not implemented in dune-codegen")
+    # This is a backward-compatibility hack: So far we silently used OPBFem for DG with simplices:
+    if short == "DG" and isSimplical(cell):
+        short = "OPB"
+    # Choose the correct finite element implementation
+    if short == "CG":
+        if isSimplical(cell):
+            if dim in (1, 2, 3):
+                include_file("dune/pdelab/finiteelementmap/pkfem.hh", filetag="driver")
+                return "using {} = Dune::PDELab::PkLocalFiniteElementMap<{}, {}, {}, {}>;" \
+                    .format(name, gv, df, r, degree)
+            else:
+                raise CodegenUnsupportedFiniteElementError(element)
+        elif isQuadrilateral(cell):
+            if dim in (2, 3) and degree < 3:
+                include_file("dune/pdelab/finiteelementmap/qkfem.hh", filetag="driver")
+                return "using {} = Dune::PDELab::QkLocalFiniteElementMap<{}, {}, {}, {}>;" \
+                    .format(name, gv, df, r, degree)
+            else:
+                raise CodegenUnsupportedFiniteElementError(element)
+        else:
+            raise CodegenUnsupportedFiniteElementError(element)
+    elif short == "DG":
+        if isQuadrilateral(cell):
+            if dim < 4:
+                include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="driver")
+                return "using {} = Dune::PDELab::QkDGLocalFiniteElementMap<{}, {}, {}, {}>;" \
+                    .format(name, df, r, degree, dim)
+            else:
+                raise CodegenUnsupportedFiniteElementError(element)
+        else:
+            raise CodegenUnsupportedFiniteElementError(element)
+    elif short == "GL":
+        raise NotImplementedError("Gauss-Legendre polynomials not implemented")
+    elif short == "DGLL":
+        raise NotImplementedError("Discontinuous Gauss-Lobatto-Legendre polynomials not implemented")
+    elif short == "OPB":
+        if isQuadrilateral(cell):
+            gt = "Dune::GeometryType::cube"
+        elif isSimplical(cell):
+            gt = "Dune::GeometryType::simplex"
+        else:
+            raise CodegenUnsupportedFiniteElementError(element)
+        include_file("dune/pdelab/finiteelementmap/opbfem.hh", filetag="driver")
+        return "using {} = Dune::PDELab::OPBLocalFiniteElementMap<{}, {}, {}, {}, {}>;" \
+            .format(name, df, r, degree, dim, gt)
+    elif short == "Monom":
+        raise NotImplementedError("Monomials basis DG not implemented")
+    elif short == "RaTu":
+        raise NotImplementedError("Rannacher-Turek elements not implemented")
+    elif short == "RT":
+        raise NotImplementedError("Raviart-Thomas elements not implemented")
+    elif short == "BDM":
+        raise NotImplementedError("Brezzi-Douglas-Marini elements not implemented")
+    else:
+        raise CodegenUnsupportedFiniteElementError(element)
 def type_fem(element):
@@ -188,23 +223,13 @@ def type_fem(element):
 def define_fem(element, name):
     femtype = type_fem(element)
-    from dune.codegen.pdelab.driver import isDG
-    if isinstance(element, TensorProductElement):
-        # Only allow TensorProductElements where all subelements are
-        # of the same type ('CG' or 'DG')
-        assert(len(set(subel._short_name for subel in element.sub_elements())) == 1)
-        if isDG(element.sub_elements()[0]):
-            return "{} {};".format(femtype, name)
-        else:
-            assert(isLagrange(element.sub_elements()[0]))
-            gv = name_leafview()
-            return "{} {}({});".format(femtype, name, gv)
-    elif isDG(element):
-        return "{} {};".format(femtype, name)
-    else:
-        assert(isLagrange(element))
+    # Determine whether the FEM is grid-dependent - currently only the Lagrange elements are.
+    # A grid-dependent FEM is constructed from the leaf view, any other one without arguments.
+    if get_short_name(element) == "CG":
         gv = name_leafview()
         return "{} {}({});".format(femtype, name, gv)
+    else:
+        return "{} {};".format(femtype, name)
 def name_fem(element):
diff --git a/python/dune/codegen/pdelab/driver/gridoperator.py b/python/dune/codegen/pdelab/driver/gridoperator.py
index abc878f0715dbcb4a0501d56247c4f03c98f20df..bdf8b2f6245acb63b2a67c518ab98fef597672af 100644
--- a/python/dune/codegen/pdelab/driver/gridoperator.py
+++ b/python/dune/codegen/pdelab/driver/gridoperator.py
@@ -77,8 +77,7 @@ def typedef_localoperator(name, form_ident):
     filename = get_form_option("filename", form_ident)
     include_file(filename, filetag="driver")
     lopname = localoperator_basename(form_ident)
-    range_type = type_range()
-    return "using {} = {}<{}, {}, {}>;".format(name, lopname, ugfs, vgfs, range_type)
+    return "using {} = {}<{}, {}>;".format(name, lopname, ugfs, vgfs)
 def type_localoperator(form_ident):
diff --git a/python/dune/codegen/pdelab/driver/instationary.py b/python/dune/codegen/pdelab/driver/instationary.py
index 5cf7170f1dca6f4274db4bf77b4b1627466e20cb..355fd0e743f6a59716319241018d3c5794c784bd 100644
--- a/python/dune/codegen/pdelab/driver/instationary.py
+++ b/python/dune/codegen/pdelab/driver/instationary.py
@@ -132,10 +132,25 @@ def name_time():
 def typedef_timesteppingmethod(name):
+    """ Generate the using-declaration of the time stepping parameter class.
+    The class is selected by the options 'explicit_time_stepping' and
+    'time_stepping_order'. Raises NotImplementedError for any order that has
+    no parameter class assigned, for explicit and implicit schemes alike.
+    """
     r_type = type_range()
     explicit = get_option('explicit_time_stepping')
+    order = get_option('time_stepping_order')
     if explicit:
-        return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type)
+        if order == 1:
+            return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type)
+        elif order == 2:
+            return "using {} = Dune::PDELab::HeunParameter<{}>;".format(name, r_type)
+        elif order == 3:
+            return "using {} = Dune::PDELab::Shu3Parameter<{}>;".format(name, r_type)
+        elif order == 4:
+            return "using {} = Dune::PDELab::RK4Parameter<{}>;".format(name, r_type)
+        else:
+            raise NotImplementedError("Time stepping method not supported")
-        return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type)
+        if order == 1:
+            return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type)
+        elif order == 2:
+            return "using {} = Dune::PDELab::Alexander2Parameter<{}>;".format(name, r_type)
+        elif order == 3:
+            return "using {} = Dune::PDELab::Alexander3Parameter<{}>;".format(name, r_type)
+        else:
+            # Fail loudly instead of silently returning None for an unsupported order
+            raise NotImplementedError("Time stepping method not supported")
 def type_timesteppingmethod():
@@ -150,8 +165,12 @@ def define_timesteppingmethod(name):
     if explicit:
         return "{} {};".format(tsm_type, name)
-        ini = name_initree()
-        return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini)
+        order = get_option('time_stepping_order')
+        if order == 1:
+            ini = name_initree()
+            return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini)
+        else:
+            return "{} {};".format(tsm_type, name)
 def name_timesteppingmethod():
diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py
index aeca64d46c73f2327b48c22f07dca7a85a044104..6bbbd07e4b7701fe516eff9509525165ac23a5eb 100644
--- a/python/dune/codegen/pdelab/driver/timings.py
+++ b/python/dune/codegen/pdelab/driver/timings.py
@@ -4,7 +4,7 @@ from dune.codegen.generation import (cached,
-                                     )
+                                     post_include)
 from dune.codegen.options import get_option
 from dune.codegen.pdelab.driver import (get_form_ident,
@@ -24,6 +24,9 @@ from dune.codegen.pdelab.driver.solve import (name_vector,
+_sde_marks = {}
 def define_timing_identifier(name):
     ini = name_initree()
@@ -125,6 +128,17 @@ def local_operator_likwid():
     return "{}.register_likwid_timers();".format(lop_name)
+def local_operator_ssc_marks():
+    """ Generate the statement calling dump_ssc_marks() on the local operator of the current form """
+    return "{}.dump_ssc_marks();".format(name_localoperator(get_form_ident()))
+def ssc_macro():
+    """ Return the preprocessor definition of the __SSC_MARK(x) macro as a single line """
+    signature = '#define __SSC_MARK(x) do{ __asm__ __volatile__'
+    # Inline assembly: moves the mark into ebx, followed by three literal bytes
+    assembly = '("movl %0, %%ebx; .byte 100, 103, 144" : :"i"(x) : "%ebx"); } while(0)'
+    return signature + assembly
 def setup_timer():
     # TODO check that we are using YASP?
@@ -138,6 +152,10 @@ def setup_timer():
             logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably")
+    elif get_option("use_sde"):
+        post_include(ssc_macro(), filetag='driver')
+        if get_option('instrumentation_level') >= 3:
+            local_operator_ssc_marks()
         from dune.codegen.loopy.target import type_floatingpoint
         pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver")
@@ -156,14 +174,26 @@ def init_region_timer(region):
     if get_option("use_likwid"):
+    elif get_option("use_sde"):
+        pass
         from dune.codegen.generation import post_include
         post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver")
+def get_region_marks(region, driver):
+    """ Return the (start, stop) pair of marks of a region, registering it on first use.
+    A new pair is derived from the number of regions registered so far in _sde_marks;
+    both marks are scaled by 11 if driver is true and left unscaled otherwise.
+    A region that is already registered keeps its pair regardless of driver.
+    """
+    scale = 11 if driver else 1
+    start = 2 * (len(_sde_marks) + 1)
+    return _sde_marks.setdefault(region, (start * scale, (start + 1) * scale))
 def start_region_timer(region):
     if get_option("use_likwid"):
         return ["LIKWID_MARKER_START(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        # Emit the start mark of the region, which is the first entry of its mark pair.
+        # NOTE(review): the decimal digits of the mark are written behind a 0x prefix,
+        # so the generated code reads them as a hexadecimal literal - confirm intended.
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[0])]
         return ["HP_TIMER_START({});".format(region)]
@@ -171,6 +201,10 @@ def start_region_timer(region):
 def stop_region_timer(region):
     if get_option("use_likwid"):
         return ["LIKWID_MARKER_STOP(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[1]),
+                "std::cout << \"Timed region {}: {} <--> {}\" << std::endl;".format(region, *marks)]
         timestream = name_timing_stream()
         return ["HP_TIMER_STOP({});".format(region),
@@ -207,7 +241,7 @@ def timed_region(region, actions):
-        if get_option('instrumentation_level') >= 3 and not get_option('use_likwid'):
+        if get_option('instrumentation_level') >= 3 and not (get_option('use_likwid') or get_option("use_sde")):
             timestream = name_timing_stream()
             lop_name = name_localoperator(get_form_ident())
             print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
diff --git a/python/dune/codegen/pdelab/geometry.py b/python/dune/codegen/pdelab/geometry.py
index b9d5b0d03e9c137c30fa49bfefee18d835e7f003..df6b91548a5ccf217b3e36a7d176be828a35f374 100644
--- a/python/dune/codegen/pdelab/geometry.py
+++ b/python/dune/codegen/pdelab/geometry.py
@@ -12,9 +12,7 @@ from dune.codegen.generation import (class_member,
-from dune.codegen.options import (get_form_option,
-                                  option_switch,
-                                  )
+from dune.codegen.options import get_form_option
 from dune.codegen.loopy.target import dtype_floatingpoint, type_floatingpoint
 from dune.codegen.pdelab.quadrature import (quadrature_preamble,
@@ -89,6 +87,10 @@ class GenericPDELabGeometryMixin(GeometryMixinBase):
         if restriction == Restriction.NONE:
             return local
+        return self._to_cell(local, restriction)
+    @kernel_cached
+    def _to_cell(self, local, restriction):
         basename = get_pymbolic_basename(local)
         name = "{}_in_{}side".format(basename, "in" if restriction is Restriction.POSITIVE else "out")
         temporary_variable(name, shape=(world_dimension(),), shape_impl=("fv",))
@@ -269,9 +271,9 @@ class EquidistantGeometryMixin(AxiparallelGeometryMixin):
     def _define_jacobian_determinant_eval(self, name):
-        from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param, lop_template_range_field
+        from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param
         gfs = name_ansatz_gfs_constructor_param()
-        rft = lop_template_range_field()
+        # The field type of the local coordinate is the floating point type of the target
+        rft = type_floatingpoint()
+        # The generated statement evaluates integrationElement on the geometry of the first
+        # codim-0 entity of the grid view, at a default-constructed Dune::FieldVector
         return "{} = {}.gridView().template begin<0>()->geometry().integrationElement(Dune::FieldVector<{}, {}>());".format(name, gfs, rft, world_dimension())
     def define_jacobian_inverse_transposed(self, name, restriction):
@@ -289,9 +291,9 @@ class EquidistantGeometryMixin(AxiparallelGeometryMixin):
     def _define_jacobian_inverse_transposed_eval(self, name):
-        from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param, lop_template_range_field
+        from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param
         gfs = name_ansatz_gfs_constructor_param()
-        rft = lop_template_range_field()
+        # The field type of the local coordinate is the floating point type of the target
+        rft = type_floatingpoint()
+        # The generated statement evaluates jacobianInverseTransposed on the geometry of the
+        # first codim-0 entity of the grid view, at a default-constructed Dune::FieldVector
         return "{} = {}.gridView().template begin<0>()->geometry().jacobianInverseTransposed(Dune::FieldVector<{}, {}>());".format(name, gfs, rft, world_dimension())
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index dfc553fbefee7fa8c2e6aa1d92ae7ea047acdd38..835fe44e24df9f0849391e1f50422512df43c044 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -7,8 +7,8 @@ import numpy as np
 from dune.codegen.options import (get_form_option,
-                                  option_switch,
-                                  set_form_option)
+                                  form_option_context,
+                                  )
 from dune.codegen.generation import (accumulation_mixin,
@@ -26,18 +26,22 @@ from dune.codegen.generation import (accumulation_mixin,
+                                     kernel_cached,
+                                     dump_ssc_marks
 from dune.codegen.cgen.clazz import (AccessModifier,
+from dune.codegen.loopy.target import type_floatingpoint
 from dune.codegen.ufl.modified_terminals import Restriction
+from frozendict import frozendict
 import dune.codegen.loopy.mangler
@@ -72,11 +76,6 @@ def name_test_gfs_constructor_param():
     return "gfsv"
-def lop_template_range_field():
-    return "RF"
 def lop_domain_field(name):
     # TODO: Rethink for not Galerkin Method
@@ -153,6 +152,11 @@ def enum_alpha():
     return _enum_alpha(ufl_measure_to_pdelab_measure(integral_type))
+def enum_skeleton_twosided():
+    """ Return the enum declaration that sets the doSkeletonTwoSided flag to true """
+    flag = "doSkeletonTwoSided"
+    return "enum {{ {} = true }};".format(flag)
 def name_initree_member():
     return "_iniParams"
@@ -279,7 +283,7 @@ def determine_accumulation_space(info, number):
     return AccumulationSpace(lfs=lfs,
-                             element=element
+                             element=subel
@@ -344,7 +348,11 @@ class AccumulationMixinBase(object):
 class GenericAccumulationMixin(AccumulationMixinBase):
     def get_accumulation_info(self, expr):
-        return get_accumulation_info(expr, self)
+        # Delegate to the free function of the same name, passing the restriction
+        # and indices of this visitor explicitly in addition to the visitor itself.
+        restriction = self.restriction
+        # On exterior facets the positive restriction is always used
+        if self.measure == 'exterior_facet':
+            restriction = Restriction.POSITIVE
+        return get_accumulation_info(expr, restriction, self.indices, self)
     def list_accumulation_infos(self, expr):
         return list_accumulation_infos(expr, self)
@@ -402,19 +410,16 @@ def list_accumulation_infos(expr, visitor):
     return itertools.product(testgen, trialgen)
-def get_accumulation_info(expr, visitor):
+def get_accumulation_info(expr, restriction, indices, visitor):
     element = expr.ufl_element()
     leaf_element = element
     element_index = 0
     from ufl import MixedElement
     if isinstance(expr.ufl_element(), MixedElement):
-        element_index = visitor.indices[0]
+        element_index = indices[0]
         leaf_element = element.extract_component(element_index)[1]
-    restriction = visitor.restriction
-    if visitor.measure == 'exterior_facet':
-        restriction = Restriction.POSITIVE
     inames = visitor.lfs_inames(leaf_element,
@@ -448,7 +453,6 @@ def generate_accumulation_instruction(expr, visitor):
     from dune.codegen.generation import instruction
-    from dune.codegen.options import option_switch
     quad_inames = visitor.quadrature_inames()
     lfs_inames = frozenset(visitor.test_info.inames)
     if visitor.trial_info:
@@ -495,7 +499,8 @@ def visit_integral(integral):
     # Start the visiting process!
     visitor = get_visitor(measure, subdomain_id)
-    visitor.accumulate(integrand)
+    with global_context(visitor=visitor):
+        visitor.accumulate(integrand)
@@ -505,24 +510,35 @@ def visit_integral(integral):
 def generate_kernel(integrals):
     logger = logging.getLogger(__name__)
-    # Visit all integrals once to collect information (dry-run)!
-    logger.debug('generate_kernel: visit_integrals (dry run)')
-    with global_context(dry_run=True):
+    # Assert that metadata for a given measure type agrees. This is a limitation
+    # of our current approach that is hard to overcome.
+    def remove_nonuser_metadata(d):
+        # Drop the "estimated_polynomial_degree" entry before comparing, so that
+        # integrals differing only in that key still count as agreeing.
+        return frozendict({k: v for k, v in d.items() if k != "estimated_polynomial_degree"})
+    meta_dicts = [remove_nonuser_metadata(i.metadata()) for i in integrals]
+    if len(set(meta_dicts)) > 1:
+        # Name the offending measure in the message instead of emitting a raw "{}"
+        measure = get_global_context_value("measure")
+        raise CodegenUFLError("Measure {} used with varying metadata! dune-codegen does not currently support this.".format(measure))
+    with form_option_context(**meta_dicts[0]):
+        # Visit all integrals once to collect information (dry-run)!
+        logger.debug('generate_kernel: visit_integrals (dry run)')
+        with global_context(dry_run=True):
+            for integral in integrals:
+                visit_integral(integral)
+        # Now perform some checks on what should be done
+        from dune.codegen.sumfact.vectorization import decide_vectorization_strategy
+        logger.debug('generate_kernel: decide_vectorization_strategy')
+        decide_vectorization_strategy()
+        # Delete the cache contents and do the real thing!
+        logger.debug('generate_kernel: visit_integrals (no dry run)')
+        from dune.codegen.generation import delete_cache_items
+        delete_cache_items("kernel_default")
         for integral in integrals:
-    # Now perform some checks on what should be done
-    from dune.codegen.sumfact.vectorization import decide_vectorization_strategy
-    logger.debug('generate_kernel: decide_vectorization_strategy')
-    decide_vectorization_strategy()
-    # Delete the cache contents and do the real thing!
-    logger.debug('generate_kernel: visit_integrals (no dry run)')
-    from dune.codegen.generation import delete_cache_items
-    delete_cache_items("kernel_default")
-    for integral in integrals:
-        visit_integral(integral)
     from dune.codegen.pdelab.signatures import kernel_name, assembly_routine_signature
     name = kernel_name()
     signature = assembly_routine_signature()
@@ -691,6 +707,19 @@ class RegisterLikwidMethod(ClassMember):
         ClassMember.__init__(self, content)
+class RegisterSSCMarksMethod(ClassMember):
+    # Class member providing the C++ method dump_ssc_marks(). Its body consists of
+    # the items returned by retrieve_cache_items(condition='register_ssc_marks'),
+    # each indented by two spaces.
+    def __init__(self):
+        knl = name_example_kernel()
+        assert knl is not None
+        # Signature and opening brace end up on a single output line
+        content = ["void dump_ssc_marks(){"]
+        for mark_line in retrieve_cache_items(condition='register_ssc_marks'):
+            content.append('  ' + mark_line)
+        content.append("}")
+        ClassMember.__init__(self, content)
 class LoopyKernelMethod(ClassMember):
     def __init__(self, signature, kernel, add_timings=True, initializer_list=[]):
         from loopy import generate_body
@@ -718,6 +747,12 @@ class LoopyKernelMethod(ClassMember):
                     content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name))
+                elif get_option('use_sde'):
+                    from dune.codegen.pdelab.driver.timings import get_region_marks, ssc_macro
+                    post_include(ssc_macro(), filetag='operatorfile')
+                    marks = get_region_marks(timer_name, driver=False)
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[0]))
+                    dump_ssc_marks(timer_name)
                     post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
                     content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
@@ -730,6 +765,11 @@ class LoopyKernelMethod(ClassMember):
                         content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer))
+                    elif get_option('use_sde'):
+                        from dune.codegen.pdelab.driver.timings import get_region_marks
+                        setup_marks = get_region_marks(setuptimer, driver=False)
+                        content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[0]))
+                        dump_ssc_marks(setuptimer)
                         post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
                         content.append('  HP_TIMER_START({});'.format(setuptimer))
@@ -742,6 +782,8 @@ class LoopyKernelMethod(ClassMember):
             if add_timings and get_option('instrumentation_level') >= 4:
                 if get_option('use_likwid'):
                     content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[1]))
                     content.append('  ' + 'HP_TIMER_STOP({});'.format(setuptimer))
@@ -752,6 +794,8 @@ class LoopyKernelMethod(ClassMember):
             if add_timings and get_option('instrumentation_level') >= 3:
                 if get_option('use_likwid'):
                     content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[1]))
                     content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
@@ -823,7 +867,6 @@ def local_operator_default_settings(operator, form):
-    lop_template_range_field()
     # Make sure there is always the same constructor arguments, even if some of them are
     # not strictly needed. Also ensure the order.
@@ -837,10 +880,18 @@ def local_operator_default_settings(operator, form):
     base_class('Dune::PDELab::LocalOperatorDefaultFlags', classtag="operator")
     from dune.codegen.pdelab.driver import is_stationary
     if not is_stationary():
-        rf = lop_template_range_field()
+        rf = type_floatingpoint()
                    .format(rf), classtag="operator")
+    # *always* add the volume pattern, PDELab cannot handle matrices without diagonal blocks
+    with global_context(integral_type="cell"):
+        enum_pattern()
+        pattern_baseclass()
+    if get_form_option("block_preconditioner_diagonal") or get_form_option("block_preconditioner_pointdiagonal"):
+        enum_skeleton_twosided()
 def measure_is_enabled(measure):
     option_dict = {"cell": "enable_volume",
@@ -855,6 +906,16 @@ def generate_residual_kernels(form, original_form):
     if not get_form_option("generate_residuals"):
         return {}
+    if get_form_option("block_preconditioner_pointdiagonal"):
+        from ufl import derivative
+        jacform = derivative(original_form, original_form.coefficients()[0])
+        from dune.codegen.ufl.preprocess import preprocess_form
+        jacform = preprocess_form(jacform).preprocessed_form
+        from dune.codegen.ufl.transformations.blockpreconditioner import diagonal_block_jacobian
+        form = diagonal_block_jacobian(jacform)
     logger = logging.getLogger(__name__)
     with global_context(form_type='residual'):
         operator_kernels = {}
@@ -957,6 +1018,12 @@ def generate_jacobian_kernels(form, original_form):
                     from dune.codegen.pdelab.signatures import assembler_routine_name
                     with global_context(kernel=assembler_routine_name()):
                         kernel = [k for k in generate_kernels_per_integral(jac_apply_form.integrals_by_type(measure))]
+                        if kernel:
+                            enum_pattern()
+                            pattern_baseclass()
+                            enum_alpha()
                 operator_kernels[(measure, 'jacobian_apply')] = kernel
                 # Generate dummy functions for those kernels, that vanished in the differentiation process
@@ -970,13 +1037,9 @@ def generate_jacobian_kernels(form, original_form):
                         operator_kernels[(it, 'jacobian_apply')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
     if get_form_option("generate_jacobians"):
         with global_context(form_type="jacobian"):
-            if get_form_option("generate_jacobians"):
-                if get_form_option("sumfact"):
-                    was_sumfact = True
-                    if get_form_option("sumfact_regular_jacobians"):
-                        old_geometry_mixins = get_form_option("geometry_mixins")
-                        set_form_option("geometry_mixins", "generic")
-                        set_form_option("sumfact", False)
+            with form_option_context(conditional=get_form_option("sumfact") and get_form_option("sumfact_regular_jacobians"),
+                                     geometry_mixins="generic",
+                                     sumfact=False):
                 for measure in set(i.integral_type() for i in jacform.integrals()):
                     if not measure_is_enabled(measure):
@@ -986,6 +1049,12 @@ def generate_jacobian_kernels(form, original_form):
                         from dune.codegen.pdelab.signatures import assembler_routine_name
                         with global_context(kernel=assembler_routine_name()):
                             kernel = [k for k in generate_kernels_per_integral(jacform.integrals_by_type(measure))]
+                            if kernel:
+                                enum_pattern()
+                                pattern_baseclass()
+                                enum_alpha()
                     operator_kernels[(measure, 'jacobian')] = kernel
                 # Generate dummy functions for those kernels, that vanished in the differentiation process
@@ -997,10 +1066,6 @@ def generate_jacobian_kernels(form, original_form):
                     with global_context(integral_type=it):
                         from dune.codegen.pdelab.signatures import assembly_routine_signature
                         operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
-                if get_form_option("sumfact_regular_jacobians"):
-                    if was_sumfact:
-                        set_form_option("sumfact", True)
-                        set_form_option("geometry_mixins", old_geometry_mixins)
     return operator_kernels
@@ -1184,6 +1249,8 @@ def generate_localoperator_file(kernels, filename):
         include_file('dune/codegen/common/timer.hh', filetag='operatorfile')
         if get_option('use_likwid'):
+        elif get_option('use_sde'):
+            operator_methods.append(RegisterSSCMarksMethod())
     elif get_option('opcounter'):
diff --git a/python/dune/codegen/pdelab/quadrature.py b/python/dune/codegen/pdelab/quadrature.py
index ae6a7e2db7212b254e3857bc86813ea420ed0251..ca44056ce82037124cc3ea2a9781e6eb1feb9692 100644
--- a/python/dune/codegen/pdelab/quadrature.py
+++ b/python/dune/codegen/pdelab/quadrature.py
@@ -7,12 +7,14 @@ from dune.codegen.generation import (class_member,
+                                     kernel_cached,
-from dune.codegen.pdelab.localoperator import lop_template_range_field, name_ansatz_gfs_constructor_param
+from dune.codegen.loopy.target import type_floatingpoint
+from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param
 from dune.codegen.options import get_form_option
 from pymbolic.primitives import Variable, Subscript
@@ -51,7 +53,7 @@ class GenericQuadratureMixin(QuadratureMixinBase):
     def define_quadrature_weights(self, name):
-        rf = lop_template_range_field()
+        rf = type_floatingpoint()
         from dune.codegen.pdelab.geometry import local_dimension
         dim = local_dimension()
@@ -70,6 +72,7 @@ class GenericQuadratureMixin(QuadratureMixinBase):
     def quadrature_inames(self):
         return (quadrature_iname(),)
+    @kernel_cached
     def quadrature_position(self, index=None):
         from dune.codegen.pdelab.geometry import local_dimension
         dim = local_dimension()
@@ -89,7 +92,7 @@ class GenericQuadratureMixin(QuadratureMixinBase):
     def define_quadrature_points(self, name):
-        rf = lop_template_range_field()
+        rf = type_floatingpoint()
         from dune.codegen.pdelab.geometry import local_dimension
         dim = local_dimension()
@@ -200,7 +203,7 @@ def quadrature_order():
       possible to use a different quadrature_order per direction.
     if get_form_option("quadrature_order"):
-        quadrature_order = tuple(map(int, get_form_option("quadrature_order").split(',')))
+        quadrature_order = tuple(map(int, str(get_form_option("quadrature_order")).split(',')))
         quadrature_order = _estimate_quadrature_order()
diff --git a/python/dune/codegen/pdelab/signatures.py b/python/dune/codegen/pdelab/signatures.py
index 09b832ac252138181d8be19c9fd4c098b5bb9b68..34acea22ea1fda1b8369bf499d7ce8ed0c37859d 100644
--- a/python/dune/codegen/pdelab/signatures.py
+++ b/python/dune/codegen/pdelab/signatures.py
@@ -9,6 +9,7 @@ from dune.codegen.pdelab.argument import (name_accumulation_variable,
+                                          type_linearizationpointcontainer,
 from dune.codegen.pdelab.spaces import (name_testfunctionspace,
@@ -293,8 +294,9 @@ def nonlinear_jacobian_apply_volume_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 def nonlinear_jacobian_apply_volume_args():
@@ -312,8 +314,9 @@ def nonlinear_jacobian_apply_boundary_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 def nonlinear_jacobian_apply_boundary_args():
@@ -331,8 +334,9 @@ def nonlinear_jacobian_apply_skeleton_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, lfsut, cct, cct, lfsvt, avt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, lfsut, cct, lpt, lfsvt, avt, avt)
 def nonlinear_jacobian_apply_skeleton_args():
diff --git a/python/dune/codegen/pdelab/spaces.py b/python/dune/codegen/pdelab/spaces.py
index d8c3c5684ce95e5a80baca6b8da11753f997607c..19de9bc7e7d0fa8bf14327eea39ff559d3450da7 100644
--- a/python/dune/codegen/pdelab/spaces.py
+++ b/python/dune/codegen/pdelab/spaces.py
@@ -5,6 +5,7 @@ from dune.codegen.generation import (class_member,
+                                     kernel_cached,
@@ -122,15 +123,12 @@ name_lfs = partial(_function_space_traversal, defaultname=available_lfs_names, r
 type_gfs = partial(_function_space_traversal, defaultname=available_gfs_names, recfunc=_type_gfs)
-def initialize_function_spaces(expr, visitor):
-    restriction = visitor.restriction
-    if visitor.measure == 'exterior_facet':
-        restriction = Restriction.POSITIVE
+def initialize_function_spaces(expr, restriction, indices):
     index = None
     from ufl import MixedElement
     if isinstance(expr.ufl_element(), MixedElement):
-        index = visitor.indices[0]
+        index = indices[0]
     from ufl.classes import Argument, Coefficient
     if isinstance(expr, Argument) and expr.number() == 0:
diff --git a/python/dune/codegen/pdelab/tensors.py b/python/dune/codegen/pdelab/tensors.py
index a924a39ae1bba19a94c47e5542567d8717cedf57..7a86ba52eff5f255c3d040d31951043a65266890 100644
--- a/python/dune/codegen/pdelab/tensors.py
+++ b/python/dune/codegen/pdelab/tensors.py
@@ -1,12 +1,12 @@
 """ Code generation for explicitly specified tensors """
 from dune.codegen.generation import (get_counted_variable,
-                                     domain,
-                                     iname,
+from dune.codegen.loopy.symbolic import FusedMultiplyAdd as FMA
+from loopy.match import Writes
 import pymbolic.primitives as prim
 import numpy as np
@@ -14,6 +14,116 @@ import loopy as lp
 import itertools as it
+def define_determinant(name, matrix, shape, visitor):
+    # Emit one instruction that writes the determinant of the square matrix
+    # variable `matrix` into the managed temporary `name`. Only 2x2 and 3x3
+    # matrices are supported; the expansion is written with fused multiply-adds.
+    temporary_variable(name, managed=True)
+    assert len(shape) == 2 and shape[0] == shape[1]
+    dim = shape[0]
+    mat = prim.Variable(matrix)
+    def entry(i, j):
+        return prim.Subscript(mat, (i, j))
+    if dim not in (2, 3):
+        raise NotImplementedError()
+    if dim == 2:
+        expr_determinant = FMA(entry(0, 0), entry(1, 1), -1 * entry(1, 0) * entry(0, 1))
+    else:
+        # 2x2 minors of the lower two rows, obtained by deleting column 0, 1, 2
+        minor_0 = FMA(entry(1, 1), entry(2, 2), -1 * entry(1, 2) * entry(2, 1))
+        minor_1 = FMA(entry(1, 0), entry(2, 2), -1 * entry(1, 2) * entry(2, 0))
+        minor_2 = FMA(entry(1, 0), entry(2, 1), -1 * entry(1, 1) * entry(2, 0))
+        # Expansion along the first row
+        expr_determinant = FMA(entry(0, 2), minor_2,
+                               FMA(entry(0, 0), minor_0, -1 * entry(0, 1) * minor_1))
+    instruction(expression=expr_determinant,
+                assignee=prim.Variable(name),
+                within_inames=frozenset(visitor.quadrature_inames()),
+                depends_on=frozenset({Writes(matrix)})
+                )
+def define_determinant_inverse(name, matrix, shape, visitor):
+    # Emit an instruction storing 1 / det(matrix) in the managed temporary `name`.
+    # The determinant itself is generated first through name_determinant.
+    det = name_determinant(matrix, shape, visitor)
+    temporary_variable(name, managed=True)
+    reciprocal = prim.Quotient(1, prim.Variable(det))
+    target = prim.Variable(name)
+    instruction(expression=reciprocal,
+                assignee=target,
+                within_inames=frozenset(visitor.quadrature_inames()),
+                depends_on=frozenset({Writes(matrix), Writes(det)})
+                )
+def define_matrix_inverse(name, name_inv, shape, visitor):
+    # Emit one instruction per entry of the managed temporary `name_inv`, computed
+    # from the 2x2 or 3x3 matrix variable `name` and the inverse of its determinant.
+    # Entry (i, j) holds the signed minor obtained by deleting row i and column j,
+    # scaled by 1/det.
+    temporary_variable(name_inv, shape=shape, managed=True)
+    det_inv = name_determinant_inverse(name, shape, visitor)
+    assert len(shape) == 2 and shape[0] == shape[1]
+    dim = shape[0]
+    source = prim.Variable(name)
+    target = prim.Variable(name_inv)
+    positions = [(i, j) for i in range(dim) for j in range(dim)]
+    entry = {pos: prim.Subscript(source, pos) for pos in positions}
+    exprs = {}
+    if dim == 2:
+        for i, j in positions:
+            sign = 1. if i == j else -1.
+            exprs[i, j] = prim.Product((sign, prim.Variable(det_inv), entry[1 - i, 1 - j]))
+    elif dim == 3:
+        def minor(rows, cols):
+            # 2x2 determinant of the sub-matrix spanned by the given rows and columns
+            (r0, r1), (c0, c1) = rows, cols
+            return FMA(entry[r0, c0], entry[r1, c1], -1 * entry[r0, c1] * entry[r1, c0])
+        for i, j in positions:
+            rows = tuple(r for r in range(3) if r != i)
+            cols = tuple(c for c in range(3) if c != j)
+            scaled = prim.Variable(det_inv) * minor(rows, cols)
+            # Checkerboard sign pattern: negate where i + j is odd
+            exprs[i, j] = scaled * -1 if (i + j) % 2 else scaled
+    else:
+        raise NotImplementedError
+    # Instructions are issued column by column
+    for j in range(dim):
+        for i in range(dim):
+            instruction(expression=exprs[i, j],
+                        assignee=prim.Subscript(target, (i, j)),
+                        within_inames=frozenset(visitor.quadrature_inames()),
+                        depends_on=frozenset({Writes(name), Writes(det_inv)}))
+def name_determinant(matrix, shape, visitor):
+    # Generate the determinant of `matrix` and return the variable name holding it
+    det_name = matrix + "_det"
+    define_determinant(det_name, matrix, shape, visitor)
+    return det_name
+def name_determinant_inverse(matrix, shape, visitor):
+    # Generate 1/det of `matrix` and return the variable name holding it
+    det_inv_name = matrix + "_det_inv"
+    define_determinant_inverse(det_inv_name, matrix, shape, visitor)
+    return det_inv_name
+def name_matrix_inverse(name, shape, visitor):
+    # Generate the entry-wise inverse of matrix `name` and return the name of
+    # the temporary that receives it
+    inverse_name = name + "_inv"
+    define_matrix_inverse(name, inverse_name, shape, visitor)
+    return inverse_name
 def define_assembled_tensor(name, expr, visitor):
@@ -22,7 +132,7 @@ def define_assembled_tensor(name, expr, visitor):
         visitor.indices = indices
         instruction(assignee=prim.Subscript(prim.Variable(name), indices),
-                    forced_iname_deps=frozenset(visitor.interface.quadrature_inames()),
+                    forced_iname_deps=frozenset(visitor.quadrature_inames()),
@@ -36,18 +146,47 @@ def name_assembled_tensor(o, visitor):
+def code_generation_time_inversion(expr, visitor):
+    # Try to evaluate every entry of the operand of `expr` to a plain number by
+    # visiting it with the matching multiindex. Returns the numerically inverted
+    # matrix, or None as soon as one entry is not an int/float. In both cases
+    # visitor.indices is reset to None before returning.
+    entries = np.ndarray(expr.ufl_shape)
+    for multiindex in it.product(*map(range, expr.ufl_shape)):
+        visitor.indices = multiindex
+        value = visitor.call(expr.ufl_operands[0])
+        if isinstance(value, (float, int)):
+            entries[multiindex] = value
+            continue
+        # Symbolic entry encountered: give up on the compile-time evaluation
+        visitor.indices = None
+        return None
+    visitor.indices = None
+    return np.linalg.inv(entries)
 def pymbolic_matrix_inverse(o, visitor):
+    # Try to evaluate the matrix at code generation time.
+    # If this works (it does e.g. for Maxwell on structured grids)
+    # we can invert the matrix at code generation time!!!
     indices = visitor.indices
     visitor.indices = None
-    name = name_assembled_tensor(o.ufl_operands[0], visitor)
-    instruction(code="{}.invert();".format(name),
-                within_inames=frozenset(visitor.interface.quadrature_inames()),
-                depends_on=frozenset({lp.match.Writes(name),
-                                      lp.match.Tagged("sumfact_stage1"),
-                                      }),
-                tags=frozenset({"quad"}),
-                )
+    mat = code_generation_time_inversion(o, visitor)
+    if mat is not None:
+        return mat[indices]
+    # If code generation time inversion failed, we assemble it in C++
+    # and invert it there.
+    expr = o.ufl_operands[0]
+    name = name_assembled_tensor(expr, visitor)
+    # Matrices with at most three rows are inverted through name_matrix_inverse,
+    # anything larger falls back to calling invert() on the assembled tensor.
+    # The size is read from ufl_shape, the same attribute passed on below.
+    if expr.ufl_shape[0] <= 3:
+        name = name_matrix_inverse(name, expr.ufl_shape, visitor)
+    else:
+        instruction(code="{}.invert();".format(name),
+                    within_inames=frozenset(visitor.quadrature_inames()),
+                    depends_on=frozenset({lp.match.Writes(name),
+                                          lp.match.Tagged("sumfact_stage1"),
+                                          }),
+                    tags=frozenset({name}),
+                    )
     visitor.indices = indices
     return prim.Variable(name)
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index c1ccd13a5552980db0be3788651065dc8a39654c..25be6f7eb80b365d8eead39c3cbb93c0e36c52bc 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -39,9 +39,6 @@ from dune.codegen.sumfact.permutation import (permute_backward,
 from dune.codegen.sumfact.tabulation import (basis_functions_per_direction,
-from dune.codegen.sumfact.switch import (get_facedir,
-                                         get_facemod,
-                                         )
 from dune.codegen.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase
 from dune.codegen.ufl.modified_terminals import extract_modified_arguments
 from dune.codegen.tools import get_pymbolic_basename, get_leaf, ImmutableCuttingRecord
@@ -55,6 +52,7 @@ import pymbolic.primitives as prim
 from loopy.symbolic import WalkMapper
 import ufl.classes as uc
 from ufl import FiniteElement, MixedElement, TensorProductElement
+import itertools
 basis_sf_kernels = generator_factory(item_tags=("basis_sf_kernels",), context_tags='kernel', no_deco=True)
@@ -161,8 +159,10 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         if self.trial_element is None:
             return ()
-            from dune.codegen.sumfact.basis import SumfactBasisMixin
-            return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
+            mixin = get_form_option("basis_mixins")
+            from dune.codegen.generation import construct_from_mixins
+            MixinType = construct_from_mixins(mixins=[mixin], mixintype="basis", name="MixinType")
+            return MixinType.lfs_inames(MixinType(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
     def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The result of stage 2 has the correct quadrature permutation but no
@@ -351,11 +351,84 @@ class SumfactAccumulationMixin(AccumulationMixinBase):
         return get_accumulation_info(expr, self)
     def list_accumulation_infos(self, expr):
-        return list_accumulation_infos(expr, self)
+        return itertools.product(_gradsplitting_generator(expr, self),
+                                 _trial_generator(expr, self),
+                                 )
     def generate_accumulation_instruction(self, expr):
         return generate_accumulation_instruction(expr, self)
+    def get_facedir(self, restriction):
+        # Look up the face direction in the global context: "facedir_s" for the
+        # positive restriction and for exterior facet integrals, "facedir_n" for
+        # the negative restriction, None otherwise.
+        from dune.codegen.pdelab.restriction import Restriction
+        use_self = restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet"
+        if use_self:
+            key = "facedir_s"
+        elif restriction == Restriction.NEGATIVE:
+            key = "facedir_n"
+        else:
+            return None
+        return get_global_context_value(key)
+    def get_facemod(self, restriction):
+        # Look up the face modus in the global context: "facemod_s" for the
+        # positive restriction and for exterior facet integrals, "facemod_n" for
+        # the negative restriction, None otherwise.
+        from dune.codegen.pdelab.restriction import Restriction
+        use_self = restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet"
+        if use_self:
+            key = "facemod_s"
+        elif restriction == Restriction.NEGATIVE:
+            key = "facemod_n"
+        else:
+            return None
+        return get_global_context_value(key)
+    def additional_matrix_sequence(self):
+        # This mixin contributes no extra matrix sequence; subclasses may override
+        extra_sequence = None
+        return extra_sequence
+    @property
+    def prohibit_jacobian(self):
+        # Flag read by generate_accumulation_instruction; when it is True the
+        # trial element is dropped from the accumulation output. Off by default.
+        jacobian_prohibited = False
+        return jacobian_prohibited
+class SumfactPointDiagonalAccumulationMixin(SumfactAccumulationMixin):
+    # Accumulation mixin variant that builds an additional transposed basis matrix
+    # sequence from the second accumulation info, splits gradients for both
+    # argument numbers 0 and 1, and sets prohibit_jacobian.
+    def additional_matrix_sequence(self):
+        # The second entry of current_info describes argument number 1
+        info = self.current_info[1]
+        return construct_basis_matrix_sequence(transpose=True,
+                                               derivative=info.grad_index,
+                                               facedir=self.get_facedir(info.restriction),
+                                               facemod=self.get_facemod(info.restriction),
+                                               basis_size=get_basis_size(info),
+                                               )
+    def get_accumulation_info(self, expr):
+        # Build the SumfactAccumulationInfo for `expr` from the current visitor
+        # state (indices, restriction, measure, reference_grad).
+        from ufl import MixedElement
+        element = expr.ufl_element()
+        is_mixed = isinstance(element, MixedElement)
+        # For mixed elements the leading index selects the component
+        element_index = self.indices[0] if is_mixed else 0
+        restriction = self.restriction
+        if self.measure == 'exterior_facet':
+            from dune.codegen.pdelab.restriction import Restriction
+            restriction = Restriction.POSITIVE
+        grad_index = None
+        if self.reference_grad:
+            # The gradient direction follows the component index, if there is one
+            grad_index = self.indices[1] if is_mixed else self.indices[0]
+        return SumfactAccumulationInfo(element=element,
+                                       element_index=element_index,
+                                       restriction=restriction,
+                                       grad_index=grad_index,
+                                       )
+    def list_accumulation_infos(self, expr):
+        # All combinations of gradient-split infos for argument 0 and argument 1
+        return itertools.product(_gradsplitting_generator(expr, self, number=0),
+                                 _gradsplitting_generator(expr, self, number=1),
+                                 )
+    @property
+    def prohibit_jacobian(self):
+        return True
 class SumfactAccumulationInfo(ImmutableRecord):
     def __init__(self,
@@ -422,9 +495,9 @@ def _get_childs(element):
             yield (i, element.extract_component(i)[1])
-def _test_generator(expr, visitor):
+def _gradsplitting_generator(expr, visitor, number=0):
     from dune.codegen.ufl.modified_terminals import extract_modified_arguments
-    ma = extract_modified_arguments(expr, argnumber=0)
+    ma = extract_modified_arguments(expr, argnumber=number)
     if len(ma) == 0:
     element = ma[0].argexpr.ufl_element()
@@ -440,7 +513,8 @@ def _test_generator(expr, visitor):
     for res in restrictions:
         for ei, e in _get_childs(element):
             for grad in (None,) + tuple(range(dim)):
-                yield SumfactAccumulationInfo(element_index=ei,
+                yield SumfactAccumulationInfo(element=element,
+                                              element_index=ei,
@@ -465,13 +539,20 @@ def _trial_generator(expr, visitor):
             yield SumfactAccumulationInfo(element_index=ei, restriction=res, element=e)
-def list_accumulation_infos(expr, visitor):
-    import itertools
-    return itertools.product(_test_generator(expr, visitor), _trial_generator(expr, visitor))
+def get_basis_size(info):
+    # Number of basis functions per direction for the (leaf) element described
+    # by `info`: polynomial degree + 1 in every direction. A scalar degree is
+    # replicated world_dimension() times.
+    from ufl import MixedElement
+    dim = world_dimension()
+    component = info.element_index
+    leaf = info.element
+    if isinstance(leaf, MixedElement):
+        leaf = leaf.extract_component(component)[1]
+    degrees = leaf._degree
+    if isinstance(degrees, int):
+        degrees = (degrees,) * dim
+    return tuple(d + 1 for d in degrees)
 def generate_accumulation_instruction(expr, visitor):
-    dim = world_dimension()
     test_info = visitor.test_info
     trial_info = visitor.trial_info
@@ -480,14 +561,7 @@ def generate_accumulation_instruction(expr, visitor):
     # Number of basis functions per direction
-    leaf_element = test_info.element
-    from ufl import MixedElement
-    if isinstance(leaf_element, MixedElement):
-        leaf_element = leaf_element.extract_component(test_info.element_index)[1]
-    degree = leaf_element._degree
-    if isinstance(degree, int):
-        degree = (degree,) * dim
-    basis_size = tuple(deg + 1 for deg in degree)
+    basis_size = get_basis_size(test_info)
     # Anisotropic finite elements are not (yet) supported by Dune
     assert(size == basis_size[0] for size in basis_size)
@@ -521,22 +595,29 @@ def generate_accumulation_instruction(expr, visitor):
     matrix_sequence = construct_basis_matrix_sequence(
-        facedir=get_facedir(test_info.restriction),
-        facemod=get_facemod(test_info.restriction),
-        basis_size=basis_size)
+        facedir=visitor.get_facedir(test_info.restriction),
+        facemod=visitor.get_facemod(test_info.restriction),
+        basis_size=basis_size,
+        additional_sequence=visitor.additional_matrix_sequence())
     jacobian_inames = trial_info.inames
     priority = test_info.grad_index
     if priority is None:
         priority = 3
+    trial_element = trial_info.element
+    trial_element_index = trial_info.element_index
+    if visitor.prohibit_jacobian:
+        trial_element = None
+        trial_element_index = 0
     output = AccumulationOutput(matrix_sequence=matrix_sequence,
                                 restriction=(test_info.restriction, trial_info.restriction),
-                                trial_element=trial_info.element,
-                                trial_element_index=trial_info.element_index,
+                                trial_element=trial_element,
+                                trial_element_index=trial_element_index,
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 1e801f4550d17d89ef8972fe3ffa867d4a44ea25..7908ab3515eb43c1f667216481821024076bcf88 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -20,7 +20,7 @@ from cgen import ArrayOf, AlignedAttribute, Initializer
 from dune.codegen.generation import cache_restoring, delete_cache_items
 from dune.codegen.loopy.target import DuneTarget, type_floatingpoint
 from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
-from dune.codegen.options import get_option, set_option
+from dune.codegen.options import get_option, option_context
 from dune.codegen.error import CodegenAutotuneError
@@ -185,296 +185,281 @@ def generate_standalone_code_google_benchmark(sf, filename):
     # Turn off opcounting
-    opcounting = get_option("opcounter")
-    set_option("opcounter", False)
-    # Extract sum factorization kernel
-    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
-    knl = realize_sumfact_kernel_function(sf)
-    # Add the implementation of the kernel.
-    # TODO: This can probably done in a safer way?
-    first_line = knl.member.lines[0]
-    arguments = first_line[first_line.find("(") + 1:first_line.find(")")]
-    with open(filename, "w") as f:
-        f.writelines(["// {}".format(first_line),
-                      "\n",
-                      "#include \"config.h\"\n",
-                      "#include \"benchmark/benchmark.h\"\n",
-                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
-                      "#include<dune/codegen/common/vectorclass.hh>\n",
-                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
-                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
-                      "#include<random>\n",
-                      "#include<fstream>\n",
-                      "#include<iostream>\n",
-                      "\n"
-                      ])
-    write_global_data(sf, filename)
-    with open(filename, "a") as f:
-        arguments = ', '.join(sf.interface.signature_args)
-        if len(arguments) > 0:
-            arguments = ', ' + arguments
-        arguments = 'const char* buffer0, const char* buffer1' + arguments
-        f.write("void sumfact_kernel({})\n".format(arguments))
-        for line in knl.member.lines[1:]:
-            f.write("{}\n".format(line))
-        f.write("\n\n")
-        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
-    write_setup_code(sf, filename, define_thetas=False)
-    additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
-    additional_arguments = ', '.join(additional_arguments)
-    if len(additional_arguments) > 0:
-        additional_arguments = ', ' + additional_arguments
-    with open(filename, "a") as f:
-        f.writelines(["  for (auto _ : state){\n",
-                      "    sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments),
-                      "  }\n",
-                      "}\n",
-                      "BENCHMARK(BM_sumfact_kernel);\n",
-                      "\n",
-                      "BENCHMARK_MAIN();"
-                      ])
+    with option_context(opcounter=False):
+        # Extract sum factorization kernel
+        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+        knl = realize_sumfact_kernel_function(sf)
-    # Maybe turn opcounting on again
-    set_option("opcounter", opcounting)
+        # Add the implementation of the kernel.
+        # TODO: This can probably be done in a safer way?
+        first_line = knl.member.lines[0]
+        arguments = first_line[first_line.find("(") + 1:first_line.find(")")]
+        with open(filename, "w") as f:
+            f.writelines(["// {}".format(first_line),
+                          "\n",
+                          "#include \"config.h\"\n",
+                          "#include \"benchmark/benchmark.h\"\n",
+                          "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                          "#include<dune/codegen/common/vectorclass.hh>\n",
+                          "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                          "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                          "#include<random>\n",
+                          "#include<fstream>\n",
+                          "#include<iostream>\n",
+                          "\n"
+                          ])
+        write_global_data(sf, filename)
+        with open(filename, "a") as f:
+            arguments = ', '.join(sf.interface.signature_args)
+            if len(arguments) > 0:
+                arguments = ', ' + arguments
+            arguments = 'const char* buffer0, const char* buffer1' + arguments
+            f.write("void sumfact_kernel({})\n".format(arguments))
+            for line in knl.member.lines[1:]:
+                f.write("{}\n".format(line))
+            f.write("\n\n")
+            f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
+        write_setup_code(sf, filename, define_thetas=False)
+        additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
+        additional_arguments = ', '.join(additional_arguments)
+        if len(additional_arguments) > 0:
+            additional_arguments = ', ' + additional_arguments
+        with open(filename, "a") as f:
+            f.writelines(["  for (auto _ : state){\n",
+                          "    sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments),
+                          "  }\n",
+                          "}\n",
+                          "BENCHMARK(BM_sumfact_kernel);\n",
+                          "\n",
+                          "BENCHMARK_MAIN();"
+                          ])
 def generate_standalone_code(sf, filename):
     # Turn off opcounting
-    opcounting = get_option("opcounter")
-    set_option("opcounter", False)
-    # Extract sum factorization kernel
-    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
-    knl = realize_sumfact_kernel_function(sf)
-    first_line = knl.member.lines[0]
-    with open(filename, "w") as f:
-        f.writelines(["// {}".format(first_line),
-                      "\n",
-                      "#include \"config.h\"\n",
-                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
-                      "#include<dune/codegen/common/tsc.hh>\n",
-                      "#include<dune/codegen/common/vectorclass.hh>\n",
-                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
-                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
-                      "#include<random>\n",
-                      "#include<fstream>\n",
-                      "#include<iostream>\n",
-                      "\n"
-                      ])
-        f.writelines(["int main(int argc, char** argv)\n",
-                      "{\n",
-                      ])
-    write_setup_code(sf, filename)
-    # Write measurement
-    with open(filename, "a") as f:
-        # Start a TSC timer
-        f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
-                      ])
-        # Add the implementation of the kernel.
-        repeats = int(1e9 / sf.operations)
-        f.write("  for(int i=0; i<{}; ++i)\n".format(repeats))
-        f.write("  {\n")
-        for line in knl.member.lines[1:]:
-            f.write("    {}\n".format(line))
-        f.write("  }\n")
-        # Stop the TSC timer and write the result to a file
-        f.writelines(["  auto stop = Dune::PDELab::TSC::stop();\n",
-                      "  std::ofstream file;\n",
-                      "  file.open(argv[1]);\n",
-                      "  file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
-                      "  file.close();\n",
-                      "  accum += output[dis(rng)];\n",
-                      "  std::cout << accum;\n",
-                      "}\n",
-                      ])
-    # Maybe turn opcounting on again
-    set_option("opcounter", opcounting)
+    with option_context(opcounter=False):
+        # Extract sum factorization kernel
+        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+        knl = realize_sumfact_kernel_function(sf)
+        first_line = knl.member.lines[0]
+        with open(filename, "w") as f:
+            f.writelines(["// {}".format(first_line),
+                          "\n",
+                          "#include \"config.h\"\n",
+                          "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                          "#include<dune/codegen/common/tsc.hh>\n",
+                          "#include<dune/codegen/common/vectorclass.hh>\n",
+                          "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                          "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                          "#include<random>\n",
+                          "#include<fstream>\n",
+                          "#include<iostream>\n",
+                          "\n"
+                          ])
+            f.writelines(["int main(int argc, char** argv)\n",
+                          "{\n",
+                          ])
+        write_setup_code(sf, filename)
+        # Write measurement
+        with open(filename, "a") as f:
+            # Start a TSC timer
+            f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
+                          ])
+            # Add the implementation of the kernel.
+            repeats = int(1e9 / sf.operations)
+            f.write("  for(int i=0; i<{}; ++i)\n".format(repeats))
+            f.write("  {\n")
+            for line in knl.member.lines[1:]:
+                f.write("    {}\n".format(line))
+            f.write("  }\n")
+            # Stop the TSC timer and write the result to a file
+            f.writelines(["  auto stop = Dune::PDELab::TSC::stop();\n",
+                          "  std::ofstream file;\n",
+                          "  file.open(argv[1]);\n",
+                          "  file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
+                          "  file.close();\n",
+                          "  accum += output[dis(rng)];\n",
+                          "  std::cout << accum;\n",
+                          "}\n",
+                          ])
 def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
     # Turn off opcounting
-    opcounting = get_option("opcounter")
-    set_option("opcounter", False)
-    # Remove opcounter from signature
-    p = re.compile('OpCounter::OpCounter<([^>]*)>')
-    assert len(signature) == 1
-    sig = signature[0]
-    sig = p.sub(r'\1', sig)
-    assert 'OpCounter' not in signature
-    # Which transformations were applied
-    codegen_transformations = ''
-    if transformations:
+    with option_context(opcounter=False):
+        # Remove opcounter from signature
+        p = re.compile('OpCounter::OpCounter<([^>]*)>')
+        assert len(signature) == 1
+        sig = signature[0]
+        sig = p.sub(r'\1', sig)
+        # Check the rewritten string: testing the list ``signature`` would only
+        # compare whole entries and never detect a leftover substring.
+        assert 'OpCounter' not in sig
+        # Which transformations were applied
         codegen_transformations = ''
-        for trafo in transformations:
-            codegen_transformations += '// {}\n'.format(trafo)
-    template = 'kernel_benchmark_template1.cc.in'
-    use_datasets = True
-    # Old benchmark template
-    # template = 'kernel_benchmark_template0.cc.in'
-    # use_datasets = False
-    template_filename = pkg_resources.resource_filename(__name__, template)
-    with open(template_filename, 'r') as f:
-        benchmark = f.read()
-    # Find function arguments and global arguments
-    arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
-    arguments = [a.split(' ')[-1] for a in arguments]
-    global_args = [a for a in kernel.args if a.name not in arguments]
-    buffer_arguments = [a for a in arguments if a.startswith('buff')]
-    input_arguments = [a for a in arguments if a not in buffer_arguments]
-    # Declare global arguments
-    codegen_declare_global_arguments = ''
-    target = DuneTarget()
-    for g in global_args:
-        decl_info = g.decl_info(target, True, g.dtype)
-        for idi in decl_info:
-            ast_builder = target.get_device_ast_builder()
-            arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
-            arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
-            arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
-            codegen_declare_global_arguments += '{}\n'.format(arg_decl)
-    codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, '  ')
-    # Helper function for argument initialization
-    def _initialize_arg(arg):
-        if isinstance(arg, lp.ValueArg):
-            return []
-        real = type_floatingpoint()
-        size = reduce(mul, arg.shape)
-        fill_name = arg.name + '_fill'
-        lines = ['  {}* {} = (double *) {};'.format(real, fill_name, arg.name),
-                 '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
-                 '    {}[i] = unif(re);'.format(fill_name),
-                 '  }']
-        return lines
-    # Initialize global arguments
-    codegen_initialize_global_arguments = ''
-    for arg in global_args:
-        lines = _initialize_arg(arg)
-        codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
-    codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, '  ')
-    codegen_initialize_input = ''
-    # Function we want to benchmark
-    codegen_benchmark_function = ''
-    codegen_benchmark_function += sig[0:sig.find(')') + 1]
-    codegen_benchmark_function += lp.generate_body(kernel)
-    codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, '  ')
-    # Declare function arguments
-    codegen_declare_arguments = []
-    codegen_declare_input = []
-    function_arguments = [a for a in kernel.args if a.name in arguments]
-    for arg in function_arguments:
-        if 'buffer' in arg.name:
-            byte_size = reduce(mul, arg.shape) * 8
-            codegen_declare_arguments.append('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
-                                                                                                      byte_size,
-                                                                                                      arg.alignment),)
-        elif isinstance(arg, lp.ValueArg):
-            assert 'jacobian_offset' in arg.name
-            decl = arg.get_arg_decl(ast_builder)
-            decl = Initializer(decl, 'unif_int(re)')
-            codegen_declare_arguments.append(('  {}\n'.format(decl)))
-        else:
-            assert 'fastdg' in arg.name
-            size = reduce(mul, arg.shape)
-            min_stride = min([tag.stride for tag in arg.dim_tags])
-            size *= min_stride
-            alignment = arg.dtype.itemsize
+        if transformations:
+            codegen_transformations = ''
+            for trafo in transformations:
+                codegen_transformations += '// {}\n'.format(trafo)
+        template = 'kernel_benchmark_template1.cc.in'
+        use_datasets = True
+        # Old benchmark template
+        # template = 'kernel_benchmark_template0.cc.in'
+        # use_datasets = False
+        template_filename = pkg_resources.resource_filename(__name__, template)
+        with open(template_filename, 'r') as f:
+            benchmark = f.read()
+        # Find function arguments and global arguments
+        arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
+        arguments = [a.split(' ')[-1] for a in arguments]
+        global_args = [a for a in kernel.args if a.name not in arguments]
+        buffer_arguments = [a for a in arguments if a.startswith('buff')]
+        input_arguments = [a for a in arguments if a not in buffer_arguments]
+        # Declare global arguments
+        codegen_declare_global_arguments = ''
+        target = DuneTarget()
+        for g in global_args:
+            decl_info = g.decl_info(target, True, g.dtype)
+            for idi in decl_info:
+                ast_builder = target.get_device_ast_builder()
+                arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
+                arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
+                arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
+                codegen_declare_global_arguments += '{}\n'.format(arg_decl)
+        codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, '  ')
+        # Helper function for argument initialization
+        def _initialize_arg(arg):
+            if isinstance(arg, lp.ValueArg):
+                return []
             real = type_floatingpoint()
-            if use_datasets:
-                codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
-                                                                                                             arg.name,
-                                                                                                             size,
-                                                                                                             alignment)))
+            size = reduce(mul, arg.shape)
+            fill_name = arg.name + '_fill'
+            lines = ['  {}* {} = (double *) {};'.format(real, fill_name, arg.name),
+                     '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
+                     '    {}[i] = unif(re);'.format(fill_name),
+                     '  }']
+            return lines
+        # Initialize global arguments
+        codegen_initialize_global_arguments = ''
+        for arg in global_args:
+            lines = _initialize_arg(arg)
+            codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
+        codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, '  ')
+        codegen_initialize_input = ''
+        # Function we want to benchmark
+        codegen_benchmark_function = ''
+        codegen_benchmark_function += sig[0:sig.find(')') + 1]
+        codegen_benchmark_function += lp.generate_body(kernel)
+        codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, '  ')
+        # Declare function arguments
+        codegen_declare_arguments = []
+        codegen_declare_input = []
+        function_arguments = [a for a in kernel.args if a.name in arguments]
+        for arg in function_arguments:
+            if 'buffer' in arg.name:
+                byte_size = reduce(mul, arg.shape) * 8
+                codegen_declare_arguments.append('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
+                                                                                                          byte_size,
+                                                                                                          arg.alignment),)
+            elif isinstance(arg, lp.ValueArg):
+                assert 'jacobian_offset' in arg.name
+                decl = arg.get_arg_decl(ast_builder)
+                decl = Initializer(decl, 'unif_int(re)')
+                codegen_declare_arguments.append(('  {}\n'.format(decl)))
-                codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
-                                                                                                   arg.name,
-                                                                                                   size,
-                                                                                                   alignment)))
-    codegen_declare_arguments = ''.join(codegen_declare_arguments)
-    codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, '  ')
-    codegen_declare_input = ''.join(codegen_declare_input)
-    codegen_declare_input = textwrap.indent(codegen_declare_input, '  ')
-    # Initialize function arguments
-    codegen_initialize_arguments = ''
-    codegen_initialize_input = ''
-    for arg in function_arguments:
-        if 'fastdg' in arg.name:
-            if use_datasets:
-                lines = _initialize_arg(arg)
-                lines = ['  ' + a for a in lines]
-                lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
-                lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
-                lines.append('}')
-                codegen_initialize_input += '\n'.join(lines) + '\n'
+                assert 'fastdg' in arg.name
+                size = reduce(mul, arg.shape)
+                min_stride = min([tag.stride for tag in arg.dim_tags])
+                size *= min_stride
+                alignment = arg.dtype.itemsize
+                real = type_floatingpoint()
+                if use_datasets:
+                    codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                                 arg.name,
+                                                                                                                 size,
+                                                                                                                 alignment)))
+                else:
+                    codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                       arg.name,
+                                                                                                       size,
+                                                                                                       alignment)))
+        codegen_declare_arguments = ''.join(codegen_declare_arguments)
+        codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, '  ')
+        codegen_declare_input = ''.join(codegen_declare_input)
+        codegen_declare_input = textwrap.indent(codegen_declare_input, '  ')
+        # Initialize function arguments
+        codegen_initialize_arguments = ''
+        codegen_initialize_input = ''
+        for arg in function_arguments:
+            if 'fastdg' in arg.name:
+                if use_datasets:
+                    lines = _initialize_arg(arg)
+                    lines = ['  ' + a for a in lines]
+                    lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
+                    lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
+                    lines.append('}')
+                    codegen_initialize_input += '\n'.join(lines) + '\n'
+                else:
+                    lines = _initialize_arg(arg)
+                    codegen_initialize_arguments += '\n'.join(lines) + '\n'
                 lines = _initialize_arg(arg)
                 codegen_initialize_arguments += '\n'.join(lines) + '\n'
+        codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, '  ')
+        codegen_initialize_input = textwrap.indent(codegen_initialize_input, '  ')
+        # Call the benchmark function
+        if use_datasets:
+            arguments_with_datasets = arguments.copy()
+            arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
+            codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
+            codegen_call_benchmark_function += '  ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
+            for arg in input_arguments:
+                codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
+            codegen_call_benchmark_function += '}'
-            lines = _initialize_arg(arg)
-            codegen_initialize_arguments += '\n'.join(lines) + '\n'
-    codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, '  ')
-    codegen_initialize_input = textwrap.indent(codegen_initialize_input, '  ')
-    # Call the benchmark function
-    if use_datasets:
-        arguments_with_datasets = arguments.copy()
-        arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
-        codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
-        codegen_call_benchmark_function += '  ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
-        for arg in input_arguments:
-            codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
-        codegen_call_benchmark_function += '}'
-    else:
-        codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
-    codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, '    ')
-    # Replace placeholders in benchmark template
-    benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
-    benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
-    benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
-    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
-    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
-    benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
-    benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
-    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
-    benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
-    # Write benchmark source file
-    with open(filename, 'w') as f:
-        f.writelines(benchmark)
-    # Maybe turn opcounting on again
-    set_option("opcounter", opcounting)
+            codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
+        codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, '    ')
+        # Replace placeholders in benchmark template
+        benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
+        benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
+        benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
+        benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
+        benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
+        benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
+        benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
+        benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
+        benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
+        # Write benchmark source file
+        with open(filename, 'w') as f:
+            f.writelines(benchmark)
 def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index 757d51870aa7ccf690b9362b9b0522a4892f792b..83e5206b5289719f3c4e85f4ca523976cb6feb79 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -31,9 +31,6 @@ from dune.codegen.sumfact.permutation import (permute_backward,
-from dune.codegen.sumfact.switch import (get_facedir,
-                                         get_facemod,
-                                         )
 from dune.codegen.pdelab.argument import name_coefficientcontainer, name_applycontainer
 from dune.codegen.pdelab.basis import GenericBasisMixin
 from dune.codegen.pdelab.geometry import (local_dimension,
@@ -86,7 +83,7 @@ class SumfactBasisMixin(GenericBasisMixin):
         temporary_variable(name, shape=())
         quad_inames = self.quadrature_inames()
         inames = self.lfs_inames(element, restriction)
-        facedir = get_facedir(restriction)
+        facedir = self.get_facedir(restriction)
         # Collect the pairs of lfs/quad inames that are in use
         # On facets, the normal direction of the facet is excluded
@@ -106,7 +103,7 @@ class SumfactBasisMixin(GenericBasisMixin):
         # Add the missing direction on facedirs by evaluating at either 0 or 1
         if facedir is not None:
-            facemod = get_facemod(restriction)
+            facemod = self.get_facemod(restriction)
             prod = prod + (prim.Call(PolynomialLookup(name_polynomials(element.degree()), False),
                                      (prim.Variable(inames[facedir]), facemod)),)
@@ -141,7 +138,7 @@ class SumfactBasisMixin(GenericBasisMixin):
         temporary_variable(name, shape=())
         quad_inames = self.quadrature_inames()
         inames = self.lfs_inames(element, restriction)
-        facedir = get_facedir(restriction)
+        facedir = self.get_facedir(restriction)
         # Map the direction to a quadrature iname
         quadinamemapping = {}
@@ -161,7 +158,7 @@ class SumfactBasisMixin(GenericBasisMixin):
                 prod.append(tab.pymbolic((prim.Variable(quadinamemapping[i]), prim.Variable(inames[i]))))
         if facedir is not None:
-            facemod = get_facemod(restriction)
+            facemod = self.get_facemod(restriction)
             prod.append(prim.Call(PolynomialLookup(name_polynomials(element.degree()), index == facedir),
                                   (prim.Variable(inames[facedir]), facemod)),)
@@ -197,8 +194,8 @@ class SumfactBasisMixin(GenericBasisMixin):
         # Construct the matrix sequence for this sum factorization
         matrix_sequence = construct_basis_matrix_sequence(derivative=derivative,
-                                                          facedir=get_facedir(restriction),
-                                                          facemod=get_facemod(restriction),
+                                                          facedir=self.get_facedir(restriction),
+                                                          facemod=self.get_facemod(restriction),
         inp = LFSSumfactKernelInput(matrix_sequence=matrix_sequence,
@@ -235,6 +232,28 @@ class SumfactBasisMixin(GenericBasisMixin):
         return prim.Subscript(var, vsf.quadrature_index(sf, self))
+class SumfactPointDiagonalBasisMixin(SumfactBasisMixin):
+    def lfs_inames(self, element, restriction, number=1):
+        return ()
+    def implement_basis(self, element, restriction, number):
+        info = self.current_info[number]
+        if element == info.element and restriction == info.restriction:
+            return 1
+        else:
+            return 0
+    def implement_reference_gradient(self, element, restriction, number):
+        index, = self.indices
+        self.indices = None
+        info = self.current_info[number]
+        if element == info.element and restriction == info.restriction and index == info.grad_index:
+            return 1
+        else:
+            return 0
 class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
     def __init__(self,
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index 8ac8aaa4faa1a4849c3b88fb54f87b0f87ca7b2b..79ca2726bd3fab0dca0f7a21598ef13b9fe0dd0a 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -16,7 +16,8 @@ from dune.codegen.generation import (class_member,
 from dune.codegen.loopy.flatten import flatten_index
-from dune.codegen.options import get_option
+from dune.codegen.loopy.target import type_floatingpoint
+from dune.codegen.options import get_form_option, get_option
 from dune.codegen.pdelab.geometry import (enforce_boundary_restriction,
@@ -28,7 +29,6 @@ from dune.codegen.pdelab.geometry import (enforce_boundary_restriction,
 from dune.codegen.pdelab.localoperator import (name_ansatz_gfs_constructor_param,
-                                               lop_template_range_field,
 from dune.codegen.pdelab.restriction import restricted_name
 from dune.codegen.sumfact.accumulation import basis_sf_kernels, sumfact_iname
@@ -39,10 +39,8 @@ from dune.codegen.sumfact.permutation import (permute_backward,
 from dune.codegen.sumfact.quadrature import additional_inames
-from dune.codegen.sumfact.switch import get_facedir, get_facemod
 from dune.codegen.sumfact.symbolic import SumfactKernelInterfaceBase, SumfactKernel
 from dune.codegen.tools import get_pymbolic_basename, ImmutableCuttingRecord
-from dune.codegen.options import get_form_option, option_switch
 from dune.codegen.ufl.modified_terminals import Restriction
 from loopy.match import Writes
@@ -52,8 +50,13 @@ import numpy as np
 import loopy as lp
+class SumfactGeometryMixinBase(GenericPDELabGeometryMixin):
+    def nonsumfact_fallback(self):
+        return None
-class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin):
+class SumfactMultiLinearGeometryMixin(SumfactGeometryMixinBase):
     def nonsumfact_fallback(self):
         return "generic"
@@ -162,8 +165,8 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin):
     def outer_normal(self):
         """ This is the *unnormalized* outer normal """
         name = "outer_normal"
-        facedir_s = get_facedir(Restriction.POSITIVE)
-        facemod_s = get_facemod(Restriction.POSITIVE)
+        facedir_s = self.get_facedir(Restriction.POSITIVE)
+        facemod_s = self.get_facemod(Restriction.POSITIVE)
         temporary_variable(name, shape=(world_dimension(),))
         for i in range(world_dimension()):
@@ -210,8 +213,8 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin):
         restriction = enforce_boundary_restriction(self)
         # Generate sum factorization kernel and add vectorization info
-        matrix_sequence = construct_basis_matrix_sequence(facedir=get_facedir(restriction),
-                                                          facemod=get_facemod(restriction),
+        matrix_sequence = construct_basis_matrix_sequence(facedir=self.get_facedir(restriction),
+                                                          facemod=self.get_facemod(restriction),
                                                           basis_size=(2,) * world_dimension())
         inp = GeoCornersInput(matrix_sequence=matrix_sequence,
@@ -243,7 +246,7 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin):
-class SumfactAxiParallelGeometryMixin(AxiparallelGeometryMixin):
+class SumfactAxiParallelGeometryMixin(SumfactGeometryMixinBase, AxiparallelGeometryMixin):
     def nonsumfact_fallback(self):
         return "axiparallel"
@@ -253,8 +256,8 @@ class SumfactAxiParallelGeometryMixin(AxiparallelGeometryMixin):
         assert isinstance(i, int)
         # Use facemod_s and facedir_s
-        if i == get_facedir(Restriction.POSITIVE):
-            if get_facemod(Restriction.POSITIVE):
+        if i == self.get_facedir(Restriction.POSITIVE):
+            if self.get_facemod(Restriction.POSITIVE):
                 return 1
                 return -1
@@ -270,7 +273,7 @@ class SumfactEqudistantGeometryMixin(EquidistantGeometryMixin, SumfactAxiParalle
     def facet_jacobian_determinant(self, o):
         name = "fdetjac"
-        facedir = get_facedir(Restriction.POSITIVE)
+        facedir = self.get_facedir(Restriction.POSITIVE)
         globalarg(name, shape=(world_dimension(),))
         return prim.Subscript(prim.Variable(name), (facedir,))
@@ -283,7 +286,7 @@ class SumfactEqudistantGeometryMixin(EquidistantGeometryMixin, SumfactAxiParalle
     def _define_facet_jacobian_determinant_eval(self, name):
         gfs = name_ansatz_gfs_constructor_param()
-        rft = lop_template_range_field()
+        rft = type_floatingpoint()
         code = ["{",
                 "  auto e = *({}.gridView().template begin<0>());".format(gfs),
                 "  int dir=0;",
@@ -304,8 +307,7 @@ class SumfactEqudistantGeometryMixin(EquidistantGeometryMixin, SumfactAxiParalle
         restriction = Restriction.NONE
         if self.measure == "interior_facet":
             restriction = Restriction.POSITIVE
-        from dune.codegen.sumfact.switch import get_facedir
-        face = get_facedir(restriction)
+        face = self.get_facedir(restriction)
         lowcorner = name_lowerleft_corner()
         meshwidth = name_meshwidth()
@@ -483,8 +485,7 @@ def define_corner(name, low):
 def define_mesh_width(name):
-    from dune.codegen.pdelab.localoperator import lop_template_range_field
-    rft = lop_template_range_field()
+    rft = type_floatingpoint()
     return "Dune::FieldVector<{}, {}> {};".format(rft, world_dimension(), name)
@@ -528,8 +529,8 @@ def _name_jacobian(i, j, restriction, visitor):
     # Create matrix sequence with derivative in j direction
     matrix_sequence = construct_basis_matrix_sequence(derivative=j,
-                                                      facedir=get_facedir(restriction),
-                                                      facemod=get_facemod(restriction),
+                                                      facedir=visitor.get_facedir(restriction),
+                                                      facemod=visitor.get_facemod(restriction),
                                                       basis_size=(2,) * world_dimension())
     # Sum factorization input for the i'th component of the geometry mapping
diff --git a/python/dune/codegen/sumfact/permutation.py b/python/dune/codegen/sumfact/permutation.py
index 7f37dfeae795031ec81b292b96e65649f8fc78cc..916f7773592191a7dcc0731e56b93524daf4528e 100644
--- a/python/dune/codegen/sumfact/permutation.py
+++ b/python/dune/codegen/sumfact/permutation.py
@@ -3,7 +3,7 @@
 import itertools
 from dune.codegen.options import get_option
-from dune.codegen.sumfact.switch import get_facedir, get_facemod
+from dune.codegen.sumfact.tabulation import quadrature_points_per_direction
 from dune.codegen.ufl.modified_terminals import Restriction
@@ -125,8 +125,9 @@ def sumfact_quadrature_permutation_strategy(dim, restriction):
         # all others can be derived by rotating the cube and matching edge
         # directions.
         def _order_on_self(restriction):
-            facedir = get_facedir(restriction)
-            facemod = get_facemod(restriction)
+            from dune.codegen.sumfact.accumulation import SumfactAccumulationMixin
+            facedir = SumfactAccumulationMixin.get_facedir(None, restriction)
+            facemod = SumfactAccumulationMixin.get_facemod(None, restriction)
             quadrature_order = {
                 (0, 0): (0, 1, 2),
diff --git a/python/dune/codegen/sumfact/quadrature.py b/python/dune/codegen/sumfact/quadrature.py
index 91e99c4c3c0cd333d6e64353ed06ac0c129a7e05..ae387426ebda79a8435811c548e91761f2de1b3c 100644
--- a/python/dune/codegen/sumfact/quadrature.py
+++ b/python/dune/codegen/sumfact/quadrature.py
@@ -10,7 +10,6 @@ from dune.codegen.generation import (domain,
-from dune.codegen.sumfact.switch import get_facedir
 from dune.codegen.sumfact.tabulation import (quadrature_points_per_direction,
@@ -22,7 +21,6 @@ from dune.codegen.pdelab.geometry import (local_dimension,
 from dune.codegen.pdelab.quadrature import GenericQuadratureMixin
 from dune.codegen.options import get_form_option
-from dune.codegen.sumfact.switch import get_facedir
 from dune.codegen.loopy.target import dtype_floatingpoint
 from loopy import CallMangleInfo
diff --git a/python/dune/codegen/sumfact/switch.py b/python/dune/codegen/sumfact/switch.py
index a031790dd5321841eb8c100195cde675651fe1d3..6b8c43e4dac7bfd08f7bbbfb6a551b47fd02a499 100644
--- a/python/dune/codegen/sumfact/switch.py
+++ b/python/dune/codegen/sumfact/switch.py
@@ -12,7 +12,7 @@ from dune.codegen.pdelab.signatures import (assembly_routine_args,
-from dune.codegen.options import get_form_option, get_option, set_form_option
+from dune.codegen.options import get_form_option, get_option, form_option_context
 from dune.codegen.cgen.clazz import ClassMember
@@ -26,21 +26,12 @@ def sumfact_generate_kernels_per_integral(integrals):
     if measure == "exterior_facet":
         # Maybe skip sum factorization on boundary integrals
         if not get_form_option("sumfact_on_boundary"):
-            set_form_option("sumfact", False)
-            # Try to find a fallback for sum factorized geometry mixins
-            geometry_backup = get_form_option("geometry_mixins")
-            mixin = construct_from_mixins(mixins=[geometry_backup])()
-            if hasattr(mixin, "nonsumfact_fallback"):
-                set_form_option("geometry_mixins", mixin.nonsumfact_fallback())
-            for k in generate_kernels_per_integral(integrals):
-                yield k
-            # Reset state
-            set_form_option("geometry_mixins", geometry_backup)
-            set_form_option("sumfact", True)
-            return
+            mixin = construct_from_mixins(mixins=[get_form_option("geometry_mixins")])()
+            geometry = mixin.nonsumfact_fallback() or get_form_option("geometry_mixins")
+            with form_option_context(sumfact=False, geometry_mixins=geometry):
+                for k in generate_kernels_per_integral(integrals):
+                    yield k
+                return
         # Generate all necessary kernels
         for facedir in range(dim):
@@ -169,25 +160,3 @@ def generate_interior_facet_switch():
     return ClassMember(signature + block)
-def get_facedir(restriction):
-    from dune.codegen.pdelab.restriction import Restriction
-    if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet":
-        return get_global_context_value("facedir_s")
-    if restriction == Restriction.NEGATIVE:
-        return get_global_context_value("facedir_n")
-    if restriction == Restriction.NONE:
-        return None
-    assert False
-def get_facemod(restriction):
-    from dune.codegen.pdelab.restriction import Restriction
-    if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet":
-        return get_global_context_value("facemod_s")
-    if restriction == Restriction.NEGATIVE:
-        return get_global_context_value("facemod_n")
-    if restriction == Restriction.NONE:
-        return None
-    assert False
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index dfd9383f93c79d3c50bfcca24cdd717fcc16aa58..8fdd1dd8d23ddca5745be5acb4e02837478d4c9d 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -14,7 +14,7 @@ from dune.codegen.sumfact.permutation import (flop_cost,
-from dune.codegen.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray
+from dune.codegen.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray, quadrature_points_per_direction
 from dune.codegen.loopy.target import dtype_floatingpoint, type_floatingpoint
 from dune.codegen.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad
 from dune.codegen.tools import get_leaf, maybe_wrap_subscript, remove_duplicates
@@ -562,6 +562,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         ImmutableRecord.__init__(self, **defaultdict)
         prim.Variable.__init__(self, "SUMFACT")
+        # Caches filled lazily on first use: the cache key, and the flop cost per quadrature point configuration
+        self._cached_cache_key = None
+        self._cached_flop_cost = {}
     # The methods/fields needed to get a well-formed pymbolic node
@@ -617,12 +621,15 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         Any two sum factorization kernels having the same cache_key
         are realized simultaneously!
-        if self.buffer is None:
-            # During dry run, we return something unique to this kernel
-            return repr(self)
-        else:
-            # Later we identify parallely implemented kernels by the assigned buffer
-            return self.buffer
+        if self._cached_cache_key is None:
+            if self.buffer is None:
+                # During dry run, we return something unique to this kernel
+                self._cached_cache_key = repr(self)
+            else:
+                # Later we identify kernels that are implemented in parallel by the assigned buffer
+                self._cached_cache_key = self.buffer
+        return self._cached_cache_key
     def inout_key(self):
@@ -818,7 +825,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def operations(self):
         """ The total number of floating point operations for the kernel
         to be carried out """
-        return flop_cost(self.matrix_sequence_cost_permuted)
+        qp = quadrature_points_per_direction()
+        if qp not in self._cached_flop_cost:
+            self._cached_flop_cost[qp] = flop_cost(self.matrix_sequence_cost_permuted)
+        return self._cached_flop_cost[qp]
 # Extract the argument list and store it on the class. This needs to be done
@@ -865,6 +875,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         prim.Variable.__init__(self, "VecSUMFAC")
+        # Caches filled lazily on first use: the cache key, and the flop cost per quadrature point configuration
+        self._cached_cache_key = None
+        self._cached_flop_cost = {}
     def __getinitargs__(self):
         return (self.kernels, self.horizontal_width, self.vertical_width, self.buffer, self.insn_dep)
@@ -897,7 +911,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         Any two sum factorization kernels having the same cache_key
         are realized simulatenously!
-        return (self.matrix_sequence_quadrature_permuted, self.restriction, self.stage, self.buffer)
+        if self._cached_cache_key is None:
+            self._cached_cache_key = (self.matrix_sequence_quadrature_permuted, self.restriction, self.stage, self.buffer)
+        return self._cached_cache_key
     # Deduce all data fields of normal sum factorization kernels from the underlying kernels
@@ -1123,4 +1140,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def operations(self):
         """ The total number of floating point operations for the kernel
         to be carried out """
-        return flop_cost(self.matrix_sequence_cost_permuted)
+        qp = quadrature_points_per_direction()
+        if qp not in self._cached_flop_cost:
+            self._cached_flop_cost[qp] = flop_cost(self.matrix_sequence_cost_permuted)
+        return self._cached_flop_cost[qp]
diff --git a/python/dune/codegen/sumfact/tabulation.py b/python/dune/codegen/sumfact/tabulation.py
index 9def97eb3ba4fdae280cc65e7f12ca73164d1146..0170c59463eed570fcce679636899e0d633d32fe 100644
--- a/python/dune/codegen/sumfact/tabulation.py
+++ b/python/dune/codegen/sumfact/tabulation.py
@@ -18,12 +18,10 @@ from dune.codegen.generation import (class_member,
-from dune.codegen.loopy.target import dtype_floatingpoint
+from dune.codegen.loopy.target import dtype_floatingpoint, type_floatingpoint
 from dune.codegen.loopy.vcl import ExplicitVCLCast, get_vcl_type_size
 from dune.codegen.options import get_option
-from dune.codegen.pdelab.localoperator import (name_domain_field,
-                                               lop_template_range_field,
-                                               )
+from dune.codegen.pdelab.localoperator import name_domain_field
 from dune.codegen.pdelab.quadrature import quadrature_order
 from dune.codegen.tools import maybe_wrap_subscript, ceildiv
 from loopy import CallMangleInfo
@@ -50,6 +48,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
+                 additional_tabulation=None,
@@ -61,6 +60,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         direction: Direction corresponding to this matrix
         slice_size: Number of slices for this direction
         slice_index: To which slice does this belong
+        additional_tabulation: A factor to be multiplied with this basis tabulation matrix.
         assert(isinstance(basis_size, int))
@@ -71,6 +71,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
+                                 additional_tabulation=additional_tabulation,
@@ -90,6 +91,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         if self.slice_size is not None:
+        if self.additional_tabulation is not None:
+            infos.append("prod{}".format(self.additional_tabulation._shortname))
         return "".join(infos)
     def __str__(self):
@@ -122,7 +126,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
     def pymbolic(self, indices):
         name = str(self)
         define_theta(name, self)
-        return prim.Subscript(prim.Variable(name), indices)
+        ret = prim.Subscript(prim.Variable(name), indices)
+        return ret
     def vectorized(self):
@@ -337,7 +343,7 @@ def name_oned_quadrature_points(bound):
 def typedef_polynomials(name, degree):
-    range_field = lop_template_range_field()
+    range_field = type_floatingpoint()
     domain_field = name_domain_field()
     include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="operatorfile")
@@ -372,7 +378,7 @@ def name_polynomials(degree):
 def sort_quadrature_points_weights(qp, qw, bound):
-    range_field = lop_template_range_field()
+    range_field = type_floatingpoint()
     domain_field = name_domain_field()
     include_file("dune/codegen/sumfact/onedquadrature.hh", filetag="operatorfile")
     return frozenset({instruction(code="onedQuadraturePointsWeights<{}, {}, {}>({}, {});"
@@ -461,22 +467,31 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
+    # Get right hand side of basis evaluation matrix assignment
+    expr = prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args))
+    # Optionally multiply each entry with the corresponding entry of the additional tabulation (only needed for the very special case of assembling point diagonals)
+    if tabmat.additional_tabulation is not None:
+        expr = prim.Product((expr, prim.Call(PolynomialLookup(polynomials, tabmat.additional_tabulation.derivative), tuple(args))))
     instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices),
-                expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)),
+                expression=expr,
-def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None):
+def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None, additional_sequence=None):
     dim = world_dimension()
     result = [None] * dim
+    if additional_sequence is None:
+        additional_sequence = [None] * dim
     quadrature_size = quadrature_points_per_direction()
     assert (basis_size is not None)
     if facedir is not None:
         quadrature_size = quadrature_size[:facedir] + (1,) + quadrature_size[facedir:]
-    for i in range(dim):
+    for i, add_seq in zip(range(dim), additional_sequence):
         onface = None
         if facedir == i:
             onface = facemod
@@ -485,6 +500,7 @@ def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=No
                                           derivative=derivative == i,
-                                          face=onface)
+                                          face=onface,
+                                          additional_tabulation=add_seq)
     return tuple(result)
diff --git a/python/dune/codegen/sumfact/vectorization.py b/python/dune/codegen/sumfact/vectorization.py
index e753652b10b3ac9b1765ee071bce1ccebd15f6b1..8da146281d6d5323303d215f42117ff5f011da37 100644
--- a/python/dune/codegen/sumfact/vectorization.py
+++ b/python/dune/codegen/sumfact/vectorization.py
@@ -20,7 +20,7 @@ from dune.codegen.sumfact.tabulation import (quadrature_points_per_direction,
 from dune.codegen.error import CodegenVectorizationError
-from dune.codegen.options import get_form_option, get_option, set_form_option
+from dune.codegen.options import get_form_option, get_option, form_option_context
 from dune.codegen.tools import add_to_frozendict, round_to_multiple, list_diff
 from pymbolic.mapper.flop_counter import FlopCounter
@@ -331,17 +331,16 @@ def level1_optimal_vectorization_strategy(sumfacts, width):
     # If we are using the 'target' strategy, we might want to log some information.
     if get_form_option("vectorization_strategy") == "target":
         # Print the achieved cost and the target cost on the screen
-        set_form_option("vectorization_strategy", "model")
-        target = float(get_form_option("vectorization_target"))
-        qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target))
-        cost = strategy_cost((qp, optimal_strategies[qp]))
-        print("The target cost was:   {}".format(target))
-        print("The achieved cost was: {}".format(cost))
-        optimum = level1_optimal_vectorization_strategy(sumfacts, width)
-        print("The optimal cost would be: {}".format(strategy_cost(optimum)))
-        set_form_option("vectorization_strategy", "target")
-        print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp]))))
+        with form_option_context(vectorization_strategy="model"):
+            target = float(get_form_option("vectorization_target"))
+            qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target))
+            cost = strategy_cost((qp, optimal_strategies[qp]))
+            print("The target cost was:   {}".format(target))
+            print("The achieved cost was: {}".format(cost))
+            optimum = level1_optimal_vectorization_strategy(sumfacts, width)
+            print("The optimal cost would be: {}".format(strategy_cost(optimum)))
+            print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp]))))
         # Print the employed vectorization strategy into a file
         suffix = ""
diff --git a/python/dune/codegen/ufl/transformations/blockpreconditioner.py b/python/dune/codegen/ufl/transformations/blockpreconditioner.py
index c992af12e0cae3499f6f6694027610dfd53c535e..b3fd11249e766bdc8abec4c0d01defbf5de9e59c 100644
--- a/python/dune/codegen/ufl/transformations/blockpreconditioner.py
+++ b/python/dune/codegen/ufl/transformations/blockpreconditioner.py
@@ -21,7 +21,7 @@ class OffDiagonalBlockSwitcher(MultiFunction):
     def positive_restricted(self, o):
         self.res = Restriction.POSITIVE
         ret = self(o.ufl_operands[0])
-        self.rest = Restriction.NONE
+        self.res = Restriction.NONE
         if isinstance(ret, uc.Zero):
             return ret
@@ -55,6 +55,8 @@ class OffDiagonalBlockSwitcher(MultiFunction):
 def list_restriction_tuples(diagonal):
     if diagonal:
         yield (Restriction.NONE, Restriction.NONE)
+        yield (Restriction.POSITIVE, Restriction.POSITIVE)
+        return
     res = (Restriction.POSITIVE, Restriction.NEGATIVE)
     amount = 1 if diagonal else 2
diff --git a/python/dune/codegen/ufl/visitor.py b/python/dune/codegen/ufl/visitor.py
index ab7c334f323bbc51221d926d357466d6ebc8cf88..e774e03d90e8bc61a1108767867ab046bfa56c81 100644
--- a/python/dune/codegen/ufl/visitor.py
+++ b/python/dune/codegen/ufl/visitor.py
@@ -37,6 +37,7 @@ from ufl.classes import (Coefficient,
+from pytools import product as ptproduct
 import pymbolic.primitives as prim
 import numpy as np
@@ -278,7 +279,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     def product(self, o):
-        return prim.flattened_product(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
+        if all(isinstance(op, (int, float)) for op in ops):
+            return ptproduct(ops)
+        return prim.flattened_product(ops)
     def float_value(self, o):
         return o.value()
@@ -290,7 +294,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         return prim.quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1]))
     def sum(self, o):
-        return prim.flattened_sum(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
+        if all(isinstance(op, (int, float)) for op in ops):
+            return sum(ops)
+        return prim.flattened_sum(ops)
     def zero(self, o):
         # UFL has Zeroes with shape. We ignore those indices.
diff --git a/python/setup.py b/python/setup.py
index 2edf70aaf5f063a278f06b5cfe7c91ac145ed857..9692259428fc392fda99403d44396d3d420ea1d0 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -29,6 +29,7 @@ setup(name='dune.codegen',
       description='Performance optimizing form compiler for the Dune project',
       author='Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>',
+      python_requires='>=3',
@@ -47,5 +48,6 @@ setup(name='dune.codegen',
         "console_scripts": [
             "generate_operators = dune.codegen.compile:entry_generate_operators",
             "generate_driver = dune.codegen.compile:entry_generate_driver",
+            "show_options = dune.codegen.options:show_options",
diff --git a/test/adjoint/poisson_mc_driver.hh b/test/adjoint/poisson_mc_driver.hh
index 5bd222b800baf36c09bd172c48d706a135912c80..a3ac578afc8d9a2039dd9437ef48fe971a37fcb6 100644
--- a/test/adjoint/poisson_mc_driver.hh
+++ b/test/adjoint/poisson_mc_driver.hh
@@ -60,7 +60,7 @@ bool driver(int argc, char** argv){
   Dune::PDELab::constraints(p1_bctype, p1_dirichlet_gfs_, p1_dirichlet_gfs__cc);
   // Set up grid grid operators...
-  using LOP_R = ROperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType>;
+  using LOP_R = ROperator<P1_dirichlet_GFS, P1_dirichlet_GFS>;
   using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
   using GO_r = Dune::PDELab::GridOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, P1_dirichlet_GFS_CC, P1_dirichlet_GFS_CC>;
   LOP_R lop_r(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree);
@@ -122,7 +122,7 @@ bool driver(int argc, char** argv){
   GF_X x_gf(p1_dirichlet_gfs_, x_r);
   // Local operator for adjoint problem
-  using LOP_Adjoint = RAdjointOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF_X>;
+  using LOP_Adjoint = RAdjointOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, GF_X>;
   LOP_Adjoint lop_adjoint(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, x_gf);
   // Grid operator for adjoint problem
@@ -156,7 +156,7 @@ bool driver(int argc, char** argv){
   DJDM dJdm(7,0.0);
   // Local operator for control problem
-  using LOP_Control = RControlOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF_Adjoint, DJDM>;
+  using LOP_Control = RControlOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, GF_Adjoint, DJDM>;
   LOP_Control lop_control(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, gf_adjoint, dJdm);
   // Grid operator for control problem
diff --git a/test/coeffeval/coeffeval_poisson.cc b/test/coeffeval/coeffeval_poisson.cc
index 4f9e8861b5139dacb8183327ad053b3cb2373b8b..f0d645ffb5a4521f80c82684de736412d49f737e 100644
--- a/test/coeffeval/coeffeval_poisson.cc
+++ b/test/coeffeval/coeffeval_poisson.cc
@@ -74,7 +74,7 @@ int main(int argc, char** argv)
   GF c_gf(p2_gfs, c);
   // Local Operator
-  using LOP_R = PoissonLocalOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF>;
+  using LOP_R = PoissonLocalOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, GF>;
   LOP_R lop_r(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, c_gf);
   // Constraints stuff
diff --git a/test/poisson/poisson_tensor.ufl b/test/poisson/poisson_tensor.ufl
index b527d05258667dae629f608a1a630e5f11f947b8..3591d4d70e17f8811cf32bb759212c7c33e0bb24 100644
--- a/test/poisson/poisson_tensor.ufl
+++ b/test/poisson/poisson_tensor.ufl
@@ -12,6 +12,9 @@ V = FiniteElement("CG", cell, 1)
 u = TrialFunction(V)
 v = TestFunction(V)
+# Test metadata setting of options
+dx = dx(metadata={"quadrature_order": 27})
 r= (inner(A*grad(u), grad(v)) + c*u*v -f*v)*dx
 exact_solution = g
 is_dirichlet = 1
diff --git a/test/sumfact/CMakeLists.txt b/test/sumfact/CMakeLists.txt
index 187aee212ed125a99bc82670fe34e680379542f7..61b81b76082cfa3c83b04d2632194d5a4a6e6447 100644
--- a/test/sumfact/CMakeLists.txt
+++ b/test/sumfact/CMakeLists.txt
@@ -2,3 +2,4 @@ add_subdirectory(hyperbolic)
diff --git a/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc b/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc
index 2f8cc6c8614e0cdb954a52fa5de808dbff466b07..d9fa57bf0d692186a5761667508f9e7c1b645715 100644
--- a/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc
+++ b/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc
@@ -109,7 +109,7 @@ int main(int argc, char** argv){
     Dune::PDELab::constraints(dg1_gfs_, dg1_gfs__cc);
     // Set up grid grid operators...
-    using LOP_R = rOperator<DG1_GFS, DG1_GFS, RangeType>;
+    using LOP_R = rOperator<DG1_GFS, DG1_GFS>;
     using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
     using GO_r = Dune::PDELab::GridOperator<DG1_GFS, DG1_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, DG1_GFS_CC, DG1_GFS_CC>;
     LOP_R lop_r(dg1_gfs_, dg1_gfs_, initree);
diff --git a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc
index 4b57deaa50bc6e435b40dec8060d55e927797b2a..cc4c1d01d3d6d5bb67f123e98696e7945002dc45 100644
--- a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc
+++ b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc
@@ -251,7 +251,7 @@ int main(int argc, char** argv){
     Dune::PDELab::constraints(dg1_gfs_, dg1_gfs__cc);
     // Set up grid grid operators...
-    using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS, RangeType>;
+    using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS>;
     using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
     using GO_r = Dune::PDELab::GridOperator<DG1_GFS, DG1_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, DG1_GFS_CC, DG1_GFS_CC>;
     LOP_R lop_r(dg1_gfs_, dg1_gfs_, initree);
diff --git a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc
index 90a5c381b538f8c41403161f96ba78a960fe491b..ad75cfc4ed894a45879fd2e078f01fa53aff3b09 100644
--- a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc
+++ b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc
@@ -225,7 +225,7 @@ int main(int argc, char** argv){
     Dune::PDELab::constraints(dg1_gfs_, dg1_gfs__cc);
     // Set up grid grid operators...
-    using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS, RangeType>;
+    using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS>;
     using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
     using GO_r = Dune::PDELab::FastDGGridOperator<DG1_GFS, DG1_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, DG1_GFS_CC, DG1_GFS_CC>;
     LOP_R lop_r(dg1_gfs_, dg1_gfs_, initree);
diff --git a/test/sumfact/preconditioner/CMakeLists.txt b/test/sumfact/preconditioner/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ea77ba5dca71ce5afdff567be1f872dc0dc604e0
--- /dev/null
+++ b/test/sumfact/preconditioner/CMakeLists.txt
@@ -0,0 +1,11 @@
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl  # 2D case: the form file, ini file and driver source passed below all carry the 2d suffix
+                                  BASENAME sumfact_preconditioner_2d
+                                  INIFILE preconditioner_2d.mini
+                                  SOURCE test_preconditioner_2d.cc
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl  # 3D case: same argument layout as the 2D call, with the 3d form file, ini file and driver source
+                                  BASENAME sumfact_preconditioner_3d
+                                  INIFILE preconditioner_3d.mini
+                                  SOURCE test_preconditioner_3d.cc
+                                  )
diff --git a/test/sumfact/preconditioner/poisson_dg_2d.ufl b/test/sumfact/preconditioner/poisson_dg_2d.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..ff41f6164df46aa1909db03768aef4041011afae
--- /dev/null
+++ b/test/sumfact/preconditioner/poisson_dg_2d.ufl
@@ -0,0 +1,38 @@
+cell = "quadrilateral"
+dim = 2
+degree = 1
+x = SpatialCoordinate(cell)
+c = (0.5-x[0])**2 + (0.5-x[1])**2
+g = exp(-1.*c)
+f = 2*(2.-2*c)*g
+V = FiniteElement("DG", cell, degree)
+u = TrialFunction(V)
+v = TestFunction(V)
+n = FacetNormal(cell)('+')
+# Penalty parameters: gamma = alpha * degree * (degree + dim - 1) / h, where h = cell volume / facet area (minimum of both cell volumes on interior facets)
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+# Symmetrization switch theta (SIPG: -1.0, IIPG: 0.0, NIPG: 1.0); the value 1.0 set below selects NIPG
+theta = 1.0
+r = inner(grad(u), grad(v))*dx \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
+exact_solution = g
diff --git a/test/sumfact/preconditioner/poisson_dg_3d.ufl b/test/sumfact/preconditioner/poisson_dg_3d.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..cc6594573762ee12e1b11206eeba07a4bd038726
--- /dev/null
+++ b/test/sumfact/preconditioner/poisson_dg_3d.ufl
@@ -0,0 +1,38 @@
+cell = hexahedron
+dim = 3
+degree = 1
+x = SpatialCoordinate(cell)
+c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2
+g = exp(-1.*c)
+f = 2*(3.-2*c)*g
+V = FiniteElement("DG", cell, degree)
+u = TrialFunction(V)
+v = TestFunction(V)
+n = FacetNormal(cell)('+')
+# Penalty parameters: gamma = alpha * degree * (degree + dim - 1) / h, where h = cell volume / facet area (minimum of both cell volumes on interior facets)
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+# Symmetrization switch theta (SIPG: -1.0, IIPG: 0.0, NIPG: 1.0); the value 1.0 set below selects NIPG
+theta = 1.0
+r = inner(grad(u), grad(v))*dx \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
+exact_solution = g
diff --git a/test/sumfact/preconditioner/preconditioner_2d.mini b/test/sumfact/preconditioner/preconditioner_2d.mini
new file mode 100644
index 0000000000000000000000000000000000000000..adcf214dde7018a9c5aa093cff89c51cf67d6a3a
--- /dev/null
+++ b/test/sumfact/preconditioner/preconditioner_2d.mini
@@ -0,0 +1,46 @@
+__name = preconditioner_2d
+__exec_suffix = exec
+cells = 2 2
+extension = 1. 1.
+name = {__name}
+extension = vtu
+operators = r, blockdiag, blockoffdiag, pointdiag
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+classname = FullOperator
+filename = full_2d_operator.hh
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_diagonal = 1
+form = r
+classname = BlockDiagonalOperator
+filename = block_diagonal_2d_operator.hh
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_offdiagonal = 1
+form = r
+classname = BlockOffDiagonalOperator
+filename = block_offdiagonal_2d_operator.hh
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_pointdiagonal = 1
+form = r
+classname = PointDiagonalOperator
+filename = point_diagonal_2d_operator.hh
diff --git a/test/sumfact/preconditioner/preconditioner_3d.mini b/test/sumfact/preconditioner/preconditioner_3d.mini
new file mode 100644
index 0000000000000000000000000000000000000000..304d6e28f0e6f152d6327929984c0121dd55e978
--- /dev/null
+++ b/test/sumfact/preconditioner/preconditioner_3d.mini
@@ -0,0 +1,46 @@
+__name = preconditioner_3d
+__exec_suffix = exec
+cells = 2 2 2
+extension = 1. 1. 1.
+name = {__name}
+extension = vtu
+operators = r, blockdiag, blockoffdiag, pointdiag
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+classname = FullOperator
+filename = full_3d_operator.hh
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_diagonal = 1
+form = r
+classname = BlockDiagonalOperator
+filename = block_diagonal_3d_operator.hh
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_offdiagonal = 1
+form = r
+classname = BlockOffDiagonalOperator
+filename = block_offdiagonal_3d_operator.hh
+sumfact = 1
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+block_preconditioner_pointdiagonal = 1
+form = r
+classname = PointDiagonalOperator
+filename = point_diagonal_3d_operator.hh
diff --git a/test/sumfact/preconditioner/test_preconditioner_2d.cc b/test/sumfact/preconditioner/test_preconditioner_2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e8fcfc117288feba4d5bd76abb5ca548e18f649
--- /dev/null
+++ b/test/sumfact/preconditioner/test_preconditioner_2d.cc
@@ -0,0 +1,129 @@
+#include "config.h"
+#include "dune/common/parallel/mpihelper.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/grid/yaspgrid.hh"
+#include "dune/pdelab/finiteelementmap/qkdg.hh"
+#include "dune/pdelab/gridoperator/fastdg.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/common/parametertreeparser.hh"
+#include <random>
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "string"
+#include "dune/codegen/vtkpredicate.hh"
+// Include all the generated operators
+#include "full_2d_operator.hh"
+#include "block_diagonal_2d_operator.hh"
+#include "block_offdiagonal_2d_operator.hh"
+#include "point_diagonal_2d_operator.hh"
+int main(int argc, char** argv){
+  try
+  {
+    // Driver overview: assemble the full, block-diagonal and block-off-diagonal Jacobians and the point-diagonal vector, then print them. First initialize MPI and read the ini file named by argv[1].
+    Dune::MPIHelper::instance(argc, argv);
+    using RangeType = double;
+    Dune::ParameterTree initree;
+    Dune::ParameterTreeParser::readINITree(argv[1], initree);
+    // Let IniGridFactory construct the equidistant 2D YaspGrid from the parameter tree and take its leaf grid view.
+    using Grid = Dune::YaspGrid<2, Dune::EquidistantCoordinates<RangeType, 2>>;
+    using GV = Grid::LeafGridView;
+    using DF = Grid::ctype;
+    IniGridFactory<Grid> factory(initree);
+    std::shared_ptr<Grid> grid = factory.getGrid();
+    GV gv = grid->leafGridView();
+    // QkDG finite element map; the template arguments 1 and 2 mirror degree and dim in poisson_dg_2d.ufl.
+    using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 2>;
+    DG2_FEM dg2_fem;
+    // Grid function space without constraints, using a fixed-blocking ISTL vector backend.
+    using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>;
+    using NoConstraintsAssembler = Dune::PDELab::NoConstraints;
+    using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>;
+    DG2_GFS dg2_gfs_(gv, dg2_fem);
+    dg2_gfs_.name("dg2_gfs_");
+    // Constraints container: cleared, then passed to Dune::PDELab::constraints together with the grid function space.
+    using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type;
+    DG2_GFS_CC dg2_gfs__cc;
+    dg2_gfs__cc.clear();
+    Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc);
+    // Set up the grid operator for the full operator (generated FullOperator wrapped in a FastDGGridOperator).
+    using FullLOP = FullOperator<DG2_GFS, DG2_GFS>;
+    using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+    using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree);
+    dg2_gfs_.update();
+    MatrixBackend mb(5);
+    FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb);
+    // Additional grid operators for the preconditioner pieces: block diagonal (BD), block off-diagonal (BOD) and point diagonal (PD); all reuse the matrix backend mb.
+    using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS>;
+    using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree);
+    BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb);
+    using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS>;
+    using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree);
+    BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb);
+    using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS>;
+    using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree);
+    PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb);
+    // Coefficient vector, initialized to zero; it is the evaluation point for every assembly below.
+    using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>;
+    V_R x(dg2_gfs_, 0.0);
+    // Assemble each operator at x and print the result for visual inspection.
+    // Jacobians are assembled for the full, block-diagonal and block-off-diagonal operators; the point-diagonal operator is evaluated through residual() into a vector.
+    using Dune::PDELab::Backend::native;
+    using M = typename FullGO::Traits::Jacobian;
+    M m(fullgo);
+    fullgo.jacobian(x, m);
+    Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1);
+    using BDM = typename BDGO::Traits::Jacobian;
+    BDM bdm(bdgo);
+    bdgo.jacobian(x, bdm);
+    Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1);
+    using BODM = typename BODGO::Traits::Jacobian;
+    BODM bodm(bodgo);
+    bodgo.jacobian(x, bodm);
+    Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1);
+    V_R pd(dg2_gfs_, 0.0);
+    pdgo.residual(x, pd);
+    Dune::printvector(std::cout, native(pd), "point diagonal vector", "row");
+    // Test failure flag; nothing sets it yet, so the test passes whenever assembly completes without throwing.
+    bool testfail(false);
+    // TODO: Properly test this stuff given the above matrices.
+    //       Right now, visuals need to suffice.
+    // Return 0 (success) unless testfail was set.
+    return testfail;
+  }
+  catch (Dune::Exception& e)
+  {    std::cerr << "Dune reported error: " << e << std::endl;
+    return 1;
+  }
+  catch (std::exception& e)
+  {    std::cerr << "Standard exception thrown: " << e.what() << std::endl;
+    return 1;
+  }
diff --git a/test/sumfact/preconditioner/test_preconditioner_3d.cc b/test/sumfact/preconditioner/test_preconditioner_3d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90ea7aee7def84e9fc2395a798ace1bc39018f7d
--- /dev/null
+++ b/test/sumfact/preconditioner/test_preconditioner_3d.cc
@@ -0,0 +1,129 @@
+#include "config.h"
+#include "dune/common/parallel/mpihelper.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/grid/yaspgrid.hh"
+#include "dune/pdelab/finiteelementmap/qkdg.hh"
+#include "dune/pdelab/gridoperator/fastdg.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/common/parametertreeparser.hh"
+#include <random>
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "string"
+#include "dune/codegen/vtkpredicate.hh"
+// Include all the generated operators
+#include "full_3d_operator.hh"
+#include "block_diagonal_3d_operator.hh"
+#include "block_offdiagonal_3d_operator.hh"
+#include "point_diagonal_3d_operator.hh"
+int main(int argc, char** argv){
+  try
+  {
+    // Driver overview: assemble the full, block-diagonal and block-off-diagonal Jacobians and the point-diagonal vector, then print them. First initialize MPI and read the ini file named by argv[1].
+    Dune::MPIHelper::instance(argc, argv);
+    using RangeType = double;
+    Dune::ParameterTree initree;
+    Dune::ParameterTreeParser::readINITree(argv[1], initree);
+    // Let IniGridFactory construct the equidistant 3D YaspGrid from the parameter tree and take its leaf grid view.
+    using Grid = Dune::YaspGrid<3, Dune::EquidistantCoordinates<RangeType, 3>>;
+    using GV = Grid::LeafGridView;
+    using DF = Grid::ctype;
+    IniGridFactory<Grid> factory(initree);
+    std::shared_ptr<Grid> grid = factory.getGrid();
+    GV gv = grid->leafGridView();
+    // QkDG finite element map; the template arguments 1 and 3 mirror degree and dim in poisson_dg_3d.ufl.
+    using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 3>;
+    DG2_FEM dg2_fem;
+    // Grid function space without constraints, using a fixed-blocking ISTL vector backend.
+    using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>;
+    using NoConstraintsAssembler = Dune::PDELab::NoConstraints;
+    using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>;
+    DG2_GFS dg2_gfs_(gv, dg2_fem);
+    dg2_gfs_.name("dg2_gfs_");
+    // Constraints container: cleared, then passed to Dune::PDELab::constraints together with the grid function space.
+    using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type;
+    DG2_GFS_CC dg2_gfs__cc;
+    dg2_gfs__cc.clear();
+    Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc);
+    // Set up the grid operator for the full operator (generated FullOperator wrapped in a FastDGGridOperator).
+    using FullLOP = FullOperator<DG2_GFS, DG2_GFS>;
+    using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+    using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree);
+    dg2_gfs_.update();
+    MatrixBackend mb(5);
+    FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb);
+    // Additional grid operators for the preconditioner pieces: block diagonal (BD), block off-diagonal (BOD) and point diagonal (PD); all reuse the matrix backend mb.
+    using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS>;
+    using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree);
+    BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb);
+    using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS>;
+    using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree);
+    BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb);
+    using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS>;
+    using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree);
+    PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb);
+    // Coefficient vector, initialized to zero; it is the evaluation point for every assembly below.
+    using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>;
+    V_R x(dg2_gfs_, 0.0);
+    // Assemble each operator at x and print the result for visual inspection.
+    // Jacobians are assembled for the full, block-diagonal and block-off-diagonal operators; the point-diagonal operator is evaluated through residual() into a vector.
+    using Dune::PDELab::Backend::native;
+    using M = typename FullGO::Traits::Jacobian;
+    M m(fullgo);
+    fullgo.jacobian(x, m);
+    Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1);
+    using BDM = typename BDGO::Traits::Jacobian;
+    BDM bdm(bdgo);
+    bdgo.jacobian(x, bdm);
+    Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1);
+    using BODM = typename BODGO::Traits::Jacobian;
+    BODM bodm(bodgo);
+    bodgo.jacobian(x, bodm);
+    Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1);
+    V_R pd(dg2_gfs_, 0.0);
+    pdgo.residual(x, pd);
+    Dune::printvector(std::cout, native(pd), "point diagonal vector", "row");
+    // Test failure flag; nothing sets it yet, so the test passes whenever assembly completes without throwing.
+    bool testfail(false);
+    // TODO: Properly test this stuff given the above matrices.
+    //       Right now, visuals need to suffice.
+    // Return 0 (success) unless testfail was set.
+    return testfail;
+  }
+  catch (Dune::Exception& e)
+  {    std::cerr << "Dune reported error: " << e << std::endl;
+    return 1;
+  }
+  catch (std::exception& e)
+  {    std::cerr << "Standard exception thrown: " << e.what() << std::endl;
+    return 1;
+  }