diff --git a/.gitmodules b/.gitmodules
index 4cf4ba72af8959c4b35e4f3e734481602dde8ba8..1733ea4f5c7666f27dfc88945612f491bc6cc4df 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,7 +3,7 @@
 	url = https://gitlab.tiker.net/inducer/loopy.git
 [submodule "python/ufl"]
 	path = python/ufl
-	url = https://parcomp-git.iwr.uni-heidelberg.de/dominic/ufl.git
+	url = https://bitbucket.org/fenics-project/ufl.git
 [submodule "python/pymbolic"]
 	path = python/pymbolic
 	url = https://github.com/inducer/pymbolic.git
diff --git a/applications/knl/poisson_dg/knl_poisson_dg.mini b/applications/knl/poisson_dg/knl_poisson_dg.mini
index bf478e854b00a14ae5c48567945051e92e22f02c..ba2f345fd39edcca22fbbb9aded8e4bd3193157c 100644
--- a/applications/knl/poisson_dg/knl_poisson_dg.mini
+++ b/applications/knl/poisson_dg/knl_poisson_dg.mini
@@ -36,18 +36,21 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+instrumentation_level = 2, 3, 4 | expand
+opcounter = 1, 0 | expand opcount
+performance_measuring = 0, 1 | expand opcount
+architecture = knl
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
 vectorization_horizontal = 4
 vectorization_vertical = 2
-instrumentation_level = 2, 3, 4 | expand
-opcounter = 1, 0 | expand opcount
-time_opcounter = 0, 1 | expand opcount
+matrix_free = 1
+generate_jacobians = 0
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-architecture = knl
-assure_statement_ordering = 1
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/knl/poisson_dg/poisson_dg.ufl b/applications/knl/poisson_dg/poisson_dg.ufl
index 7b5c3e548f81688a0735256411fd54e9d49fa9f4..b5b2b90624dcd891f08bde4aba6cf68481b152f0 100644
--- a/applications/knl/poisson_dg/poisson_dg.ufl
+++ b/applications/knl/poisson_dg/poisson_dg.ufl
@@ -28,5 +28,3 @@ r = inner(grad(u), grad(v))*dx \
   - f*v*dx \
   - theta*g*inner(grad(v), n)*ds \
   - gamma_ext*g*v*ds
-
-forms = [r]
diff --git a/applications/knl/poisson_dg/verify.mini b/applications/knl/poisson_dg/verify.mini
index d6dcdc8532a2809b4842ee2d078d72f7f20ff97a..b82dce6952c34792c633461bfadb834afd900885 100644
--- a/applications/knl/poisson_dg/verify.mini
+++ b/applications/knl/poisson_dg/verify.mini
@@ -9,6 +9,9 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+architecture = knl
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
@@ -16,7 +19,6 @@ vectorization_strategy = explicit
 vectorization_horizontal = 4
 vectorization_vertical = 2
 quadrature_order = 6
-architecture = knl
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
index 8b746a3d3dc86ae9be6a96f5c95b8bda6c6a259b..4f3f5826f71fd9fc742efa4d88ae222e45759529 100644
--- a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
+++ b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
@@ -36,18 +36,21 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+instrumentation_level = 2, 3, 4 | expand
+opcounter = 1, 0 | expand opcount
+performance_measuring = 0, 1 | expand opcount
+architecture = knl
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
 vectorization_horizontal = 4
 vectorization_vertical = 2
-instrumentation_level = 2, 3, 4 | expand
-opcounter = 1, 0 | expand opcount
-time_opcounter = 0, 1 | expand opcount
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-architecture = knl
-assure_statement_ordering = 1
+matrix_free = 1
+generate_jacobians = 0
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
index d3c95a5226e26981cf78e5d8b78a686150a9310d..918f92138a9fe04f9b2c4ddddefa472845d4eb5f 100644
--- a/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
+++ b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
@@ -31,5 +31,3 @@ r = (inner(A*grad(u), grad(v)) + (c*u-f)*v)*dx \
   + theta*u*inner(A*grad(v), n)*ds \
   - theta*g*inner(A*grad(v), n)*ds \
   - gamma_ext*g*v*ds
-
-forms = [r]
diff --git a/applications/knl/poisson_dg_tensor/verify.mini b/applications/knl/poisson_dg_tensor/verify.mini
index c2447c077f3ad00c39585b846e57b01b19ef23f3..cedf2fb3418dd92e5678ced1d20bbe0e73530d7d 100644
--- a/applications/knl/poisson_dg_tensor/verify.mini
+++ b/applications/knl/poisson_dg_tensor/verify.mini
@@ -9,6 +9,9 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+architecture = knl
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
@@ -16,7 +19,6 @@ vectorization_strategy = explicit
 vectorization_horizontal = 4
 vectorization_vertical = 2
 quadrature_order = 6
-architecture = knl
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/poisson_dg/poisson_dg.mini b/applications/poisson_dg/poisson_dg.mini
index e99c20202fad49ef344c37776b9c4264a88bbfcf..7e65a4815f5fccc2dc6aae955bbdef559e9ebe92 100644
--- a/applications/poisson_dg/poisson_dg.mini
+++ b/applications/poisson_dg/poisson_dg.mini
@@ -36,15 +36,18 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+instrumentation_level = 2, 3, 4 | expand
+opcounter = 1, 0 | expand opcount
+performance_measuring = 0, 1 | expand opcount
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
-instrumentation_level = 2, 3, 4 | expand
-opcounter = 1, 0 | expand opcount
-time_opcounter = 0, 1 | expand opcount
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-assure_statement_ordering = 1
+matrix_free = 1
+generate_jacobians = 0
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/poisson_dg/poisson_dg.ufl b/applications/poisson_dg/poisson_dg.ufl
index 7b5c3e548f81688a0735256411fd54e9d49fa9f4..54c53913fcdb844eb19c89d5100d6461536a1df7 100644
--- a/applications/poisson_dg/poisson_dg.ufl
+++ b/applications/poisson_dg/poisson_dg.ufl
@@ -29,4 +29,3 @@ r = inner(grad(u), grad(v))*dx \
   - theta*g*inner(grad(v), n)*ds \
   - gamma_ext*g*v*ds
 
-forms = [r]
diff --git a/applications/poisson_dg/verify.mini b/applications/poisson_dg/verify.mini
index a2eed15308793b163781a9ecba590cd31c4f7400..475cb4db8e0e93276ea797cef786ad3d0924ccb6 100644
--- a/applications/poisson_dg/verify.mini
+++ b/applications/poisson_dg/verify.mini
@@ -9,12 +9,14 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+exact_solution_expression = g
+compare_l2errorsquared = 1e-6
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
-exact_solution_expression = g
-compare_l2errorsquared = 1e-6
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
index 720b89578df0c59b88c764265f761c2be9e54546..4e7ac555b33fa5fcef568cca5b6dddf6dd3552e3 100644
--- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini
+++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
@@ -36,15 +36,18 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+instrumentation_level = 2, 3, 4 | expand
+opcounter = 1, 0 | expand opcount
+performance_measuring = 0, 1 | expand opcount
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
-instrumentation_level = 2, 3, 4 | expand
-opcounter = 1, 0 | expand opcount
-time_opcounter = 0, 1 | expand opcount
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-assure_statement_ordering = 1
+matrix_free = 1
+generate_jacobians = 0
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.ufl b/applications/poisson_dg_tensor/poisson_dg_tensor.ufl
index 2b1a7b9803016a50b609a9d5379461c20252310c..be57149e091d1e7f9d1eb4229992965fbf3ee633 100644
--- a/applications/poisson_dg_tensor/poisson_dg_tensor.ufl
+++ b/applications/poisson_dg_tensor/poisson_dg_tensor.ufl
@@ -32,5 +32,4 @@ r = (inner(A*grad(u), grad(v)) + (c*u-f)*v)*dx \
   - theta*g*inner(A*grad(v), n)*ds \
   - gamma_ext*g*v*ds
 
-forms = [r]
 exact_solution = g
diff --git a/applications/poisson_dg_tensor/verify.mini b/applications/poisson_dg_tensor/verify.mini
index 774b5e1b9c033c519c19bb35b8deb5db0efd58aa..d33be7360b97c07c7eeca56e878adf824381c053 100644
--- a/applications/poisson_dg_tensor/verify.mini
+++ b/applications/poisson_dg_tensor/verify.mini
@@ -9,11 +9,13 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 1e-6
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
-compare_l2errorsquared = 1e-6
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini
index 9cf5f0ba066b7879cbde5120c7a5b2d7f92de077..a80567f9b73473ae09e52c3a768177431435e849 100644
--- a/applications/stokes_dg/stokes_dg.mini
+++ b/applications/stokes_dg/stokes_dg.mini
@@ -37,16 +37,19 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+instrumentation_level = 2, 3, 4 | expand
+opcounter = 1, 0 | expand opcount
+performance_measuring = 0, 1 | expand opcount
+
+[formcompiler.r]
 fastdg = 1
 sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = model
 vectorization_allow_quadrature_changes = 1
-instrumentation_level = 2, 3, 4 | expand
-opcounter = 1, 0 | expand opcount
-time_opcounter = 0, 1 | expand opcount
 quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval
-assure_statement_ordering = 1
+matrix_free = 1
+generate_jacobians = 0
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/stokes_dg/stokes_dg.ufl b/applications/stokes_dg/stokes_dg.ufl
index 8c2ac036b6bb8961a312037b88866e5a1aff4ccb..d36556dbaf38b76dbc565437a28de38889b62d0f 100644
--- a/applications/stokes_dg/stokes_dg.ufl
+++ b/applications/stokes_dg/stokes_dg.ufl
@@ -1,4 +1,5 @@
 cell = hexahedron
+dim = 3
 
 x = SpatialCoordinate(cell)
 g_v = as_vector((4.*x[1]*(1.-x[1]), 0.0, 0.0))
@@ -14,23 +15,29 @@ u, p = TrialFunctions(TH)
 ds = ds(subdomain_id=1, subdomain_data=bctype)
 
 n = FacetNormal(cell)('+')
-eps = -1.0
-sigma = 1.0
-h_e = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = -1.0
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * v_degree * (v_degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * v_degree * (v_degree + dim - 1)) / h_int
 
 r = inner(grad(u), grad(v))*dx \
   - p*div(v)*dx \
   - q*div(u)*dx \
-  + inner(avg(grad(u))*n, jump(v))*dS \
-  + sigma / h_e * inner(jump(u), jump(v))*dS \
-  - eps * inner(avg(grad(v))*n, jump(u))*dS \
-  - avg(p)*inner(jump(v), n)*dS \
-  - avg(q)*inner(jump(u), n)*dS \
+  - inner(avg(grad(u))*n, jump(v))*dS \
+  + gamma_int * inner(jump(u), jump(v))*dS \
+  + theta * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS \
   - inner(grad(u)*n, v)*ds \
-  + sigma / h_e * inner(u-g_v, v)*ds \
-  + eps * inner(grad(v)*n, u-g_v)*ds \
+  + gamma_ext * inner(u-g_v, v)*ds \
+  + theta * inner(grad(v)*n, u-g_v)*ds \
   + p*inner(v, n)*ds \
   + q*inner(u-g_v, n)*ds
 
-forms = [r]
 exact_solution = g_v, 8*(1.-x[0])
\ No newline at end of file
diff --git a/bin/timings.sh b/bin/timings.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b85e6589dc9086b1488f788cccc86118e8a51e1e
--- /dev/null
+++ b/bin/timings.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Run every executable of a directory that has a matching ini file.
+# Executables whose ini file sets "opcounter = 1" are run once, all
+# others are run ten times. Ini files starting with "verify" are skipped.
+#
+# Usage: timings.sh [working-directory]
+# The number of MPI ranks can be overridden through the MAXCORES
+# environment variable (default: 40).
+
+MAXCORES=${MAXCORES:-40}
+
+# If an argument was given use it as the working directory
+if [ $# -eq 1 ]
+then
+  cd "$1" || exit 1
+fi
+
+# Search for runnable executables
+for inifile in *.ini
+do
+  # Skip the literal pattern of an unmatched glob and the verification runs
+  [ -e "$inifile" ] || continue
+  case "$inifile" in
+    verify*) continue ;;
+  esac
+
+  # Extract the opcounter value; a missing key counts as "not 1"
+  line=$(grep '^opcounter = ' "$inifile" | head -n 1)
+  extract=${line##opcounter = }
+  extract=${extract//[[:space:]]/}
+  UPPER=10
+  if [ "$extract" = "1" ]
+  then
+    UPPER=1
+  fi
+
+  executable=${inifile%.ini}
+  for ((run = 0; run < UPPER; run++))
+  do
+    mpirun --bind-to core -np "$MAXCORES" "./$executable" "$inifile"
+  done
+done
diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt
index 415cd9c6354e3318db1675739c36a3f90bee2916..85e38089ee5a7389a34f1697ae7a6939e8cf8a45 100644
--- a/cmake/modules/CMakeLists.txt
+++ b/cmake/modules/CMakeLists.txt
@@ -1,2 +1,5 @@
-install(FILES DunePerftoolMacros.cmake
+install(FILES deplist.py
+              DunePerftoolMacros.cmake
+              GeneratedSystemtests.cmake
+              perftool_sourcepath.py
         DESTINATION ${DUNE_INSTALL_MODULEDIR})
diff --git a/cmake/modules/DunePerftoolMacros.cmake b/cmake/modules/DunePerftoolMacros.cmake
index 66d0b4d7537b4ba44c69bfa4ebfd6b8fe9aae2c4..ca33bfae93d7bcedc198d864518f424944924efb 100644
--- a/cmake/modules/DunePerftoolMacros.cmake
+++ b/cmake/modules/DunePerftoolMacros.cmake
@@ -8,28 +8,23 @@
 #
 #       The UFL file to create the executable from.
 #
-#    .. cmake_param:: TARGET
+#    .. cmake_param:: INIFILE
 #       :single:
 #       :required:
 #
-#       The name given to the added executable target.
-#
-#    .. cmake_param:: OPERATOR
-#       :single:
-#
-#       The local operator file name to generate. Defaults
-#       to a suitably mangled, but not easily readable name.
+#       The ini file that controls the form compilation process.
+#       It is expected to contain a [formcompiler] section
 #
-#    .. cmake_param:: DRIVER
+#    .. cmake_param:: TARGET
 #       :single:
+#       :required:
 #
-#       The driver file name to generate. Defaults
-#       to a suitably mangled, but not easily readable name.
+#       The name given to the added executable target.
 #
-#    .. cmake_param:: MAIN
+#    .. cmake_param:: SOURCE
 #
-#       The main source file to generate. Defaults
-#       to a suitably mangled, but not easily readable name.
+#       The cc source file to build from. If omitted, a minimal
+#       source file and a driver file will be generated.
 #
 #    .. cmake_param:: FORM_COMPILER_ARGS
 #       :multi:
@@ -44,18 +39,17 @@
 #       Additional dependencies of the generated executable (changes in those
 #       will retrigger generation)
 #
+#    .. cmake_param:: EXCLUDE_FROM_ALL
+#       :option:
+#
+#       Set this option, if you do not want the target to be automatically
+#       built. This option is forwarded to the builtin command add_executable.
+#
 #    Add an executable to the project that gets automatically
 #    generated at configure time with the form compiler uf2pdelab.
 #    Regeneration is triggered correctly if the UFL file or the
 #    form compiler changed.
 #
-# .. cmake_variable:: UFL2PDELAB_INTERACTIVE
-#
-#    If this variable is set, all code generation will be done in
-#    interactive mode. This option is interesting in development
-#    of the form compiler, but might be quite tedious in production
-#    and automated testing.
-#
 
 add_custom_target(generation)
 
@@ -63,6 +57,7 @@ add_custom_target(generation)
 # to have correct retriggers of generated executables
 if(CMAKE_PROJECT_NAME STREQUAL dune-perftool)
   set(UFL2PDELAB_GLOB_PATTERN "${CMAKE_SOURCE_DIR}/python/*.py")
+  set(perftool_path ${CMAKE_SOURCE_DIR}/cmake/modules)
 else()
   dune_module_path(MODULE dune-perftool
                    RESULT perftool_path
@@ -76,8 +71,8 @@ endif()
 file(GLOB_RECURSE UFL2PDELAB_SOURCES ${UFL2PDELAB_GLOB_PATTERN})
 
 function(add_generated_executable)
-  set(OPTIONS)
-  set(SINGLE TARGET OPERATOR DRIVER UFLFILE)
+  set(OPTIONS EXCLUDE_FROM_ALL)
+  set(SINGLE TARGET SOURCE UFLFILE INIFILE)
   set(MULTI FORM_COMPILER_ARGS DEPENDS)
   include(CMakeParseArguments)
   cmake_parse_arguments(GEN "${OPTIONS}" "${SINGLE}" "${MULTI}" ${ARGN})
@@ -96,42 +91,56 @@ function(add_generated_executable)
   if(NOT IS_ABSOLUTE GEN_UFLFILE)
     set(GEN_UFLFILE ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_UFLFILE})
   endif()
-  if(NOT GEN_OPERATOR)
-    set(GEN_OPERATOR ${GEN_TARGET}_operator.hh)
-    set(GEN_OPERATOR ${CMAKE_CURRENT_BINARY_DIR}/${GEN_OPERATOR})
-  endif()
-  if(NOT GEN_DRIVER)
-    set(GEN_DRIVER ${GEN_TARGET}_driver.hh)
-    set(GEN_DRIVER ${CMAKE_CURRENT_BINARY_DIR}/${GEN_DRIVER})
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_INIFILE})
+    set(GEN_INIFILE ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_INIFILE})
   endif()
-  if(NOT GEN_MAIN)
-    set(GEN_MAIN ${GEN_TARGET}_main.cc)
-    set(GEN_MAIN ${CMAKE_CURRENT_BINARY_DIR}/${GEN_MAIN})
+  if(NOT GEN_SOURCE)
+    # Generate a driver file
+    set(GEN_SOURCE ${GEN_TARGET}_driver.cc)
+    add_custom_command(OUTPUT ${GEN_SOURCE}
+                       COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env generate_driver
+                               --uflfile ${GEN_UFLFILE}
+                               --ini-file ${GEN_INIFILE}
+                               --target-name ${GEN_TARGET}
+                               --driver-file ${GEN_SOURCE}
+                               --project-basedir ${CMAKE_BINARY_DIR}
+                               ${GEN_FORM_COMPILER_ARGS}
+                       DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_PERFTOOL_ADDITIONAL_PYTHON_SOURCES}
+                       COMMENT "Generating driver for the target ${GEN_TARGET}"
+                       )
   endif()
-  if(UFL2PDELAB_INTERACTIVE)
-    set(GEN_FORM_COMPILER_ARGS ${GEN_FORM_COMPILER_ARGS} --interactive)
+  if(GEN_EXCLUDE_FROM_ALL)
+    set(GEN_EXCLUDE_FROM_ALL "EXCLUDE_FROM_ALL")
+  else()
+    set(GEN_EXCLUDE_FROM_ALL "")
   endif()
 
-  # Write a standard main function
-  dune_module_path(MODULE dune-perftool
-                   RESULT perftool_path
-                   CMAKE_MODULES)
-  configure_file(${perftool_path}/StandardMain.cmake ${GEN_MAIN})
-
-  add_custom_command(OUTPUT ${GEN_OPERATOR} ${GEN_DRIVER}
-                     COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env ufl2pdelab
-                             --project-basedir ${CMAKE_BINARY_DIR}
-                             --operator-file ${GEN_OPERATOR}
-                             --driver-file ${GEN_DRIVER}
-                             ${GEN_FORM_COMPILER_ARGS}
-                             --uflfile ${GEN_UFLFILE}
-                     DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_PERFTOOL_ADDITIONAL_PYTHON_SOURCES}
-                     COMMENT "Running ufl2pdelab for the target ${GEN_TARGET}"
-                    )
+  # Parse a mapping of operators to build and their respective filenames
+  dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${perftool_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET}
+                       OUTPUT_VARIABLE depdata
+                       )
+  parse_python_data(PREFIX depdata INPUT ${depdata})
 
-  add_executable(${GEN_TARGET} ${GEN_MAIN} ${GEN_OPERATOR} ${GEN_DRIVER})
+  # Define build rules for all operator header files and gather a list of them
+  set(header_deps)
+  foreach(op ${depdata___operators})
+    add_custom_command(OUTPUT ${depdata___${op}}
+                       COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env generate_operators
+                               --project-basedir ${CMAKE_BINARY_DIR}
+                               ${GEN_FORM_COMPILER_ARGS}
+                               --uflfile ${GEN_UFLFILE}
+                               --ini-file ${GEN_INIFILE}
+                               --target-name ${GEN_TARGET}
+                               --operator-to-build ${op}
+                       DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_PERFTOOL_ADDITIONAL_PYTHON_SOURCES}
+                       COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}"
+                       )
+    set(header_deps ${header_deps} ${depdata___${op}})
+  endforeach()
 
+  add_executable(${GEN_TARGET} ${GEN_EXCLUDE_FROM_ALL} ${GEN_SOURCE} ${header_deps})
+  target_include_directories(${GEN_TARGET} PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
   add_dependencies(generation ${GEN_TARGET})
 endfunction()
 
-include(GeneratedSystemtests)
\ No newline at end of file
+include(GeneratedSystemtests)
diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake
index de9dc259ffd2442bde0c522e2883e79b461ba621..634d7ad35605ef6707436eaad98f882c40bb4c1d 100644
--- a/cmake/modules/GeneratedSystemtests.cmake
+++ b/cmake/modules/GeneratedSystemtests.cmake
@@ -4,7 +4,7 @@
 function(dune_add_formcompiler_system_test)
   # parse arguments
   set(OPTION DEBUG NO_TESTS)
-  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE)
+  set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE)
   set(MULTI CREATED_TARGETS)
   cmake_parse_arguments(SYSTEMTEST "${OPTION}" "${SINGLE}" "${MULTI}" ${ARGN})
 
@@ -12,11 +12,15 @@ function(dune_add_formcompiler_system_test)
     message(WARNING "dune_add_system_test: Encountered unparsed arguments: This often indicates typos in named arguments")
   endif()
 
-  # construct a string containg DEBUG to pass the debug flag to the other macros
+  # Construct strings to pass options to other functions
   set(DEBUG "")
   if(SYSTEMTEST_DEBUG)
     set(DEBUG "DEBUG")
   endif()
+  set(SOURCE "")
+  if(SYSTEMTEST_SOURCE)
+    set(SOURCE SOURCE ${SYSTEMTEST_SOURCE})
+  endif()
 
   # set a default for the script. call_executable.py just calls the executable.
   # There, it is also possible to hook in things depending on the inifile
@@ -49,12 +53,26 @@ function(dune_add_formcompiler_system_test)
 
     add_generated_executable(TARGET ${tname}
                              UFLFILE ${SYSTEMTEST_UFLFILE}
-                             FORM_COMPILER_ARGS --ini-file ${inifile}
+                             INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
                              DEPENDS ${SYSTEMTEST_INIFILE}
+                             EXCLUDE_FROM_ALL
+                             ${SOURCE}
                              )
 
-    # Exclude the target from all
-    set_property(TARGET ${tname} PROPERTY EXCLUDE_FROM_ALL 1)
+    # Enrich the target with preprocessor variables from the __static section
+    # just the way that dune-testtools does.
+    dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_extract_static.py
+                               --ini ${inifile}
+                         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                         OUTPUT_VARIABLE output
+                         ERROR_MESSAGE "Error extracting static info from ${inifile}")
+    parse_python_data(PREFIX STAT INPUT "${output}")
+
+    foreach(config ${STAT___CONFIGS})
+      foreach(cd ${STAT___STATIC_DATA})
+        target_compile_definitions(${tname} PUBLIC "${cd}=${STAT_${config}_${cd}}")
+      endforeach()
+    endforeach()
 
     # Add dependency on the metatarget for this systemtest
     if(NOT ${INIINFO_${inifile}_suffix} STREQUAL "__empty")
@@ -67,10 +85,15 @@ function(dune_add_formcompiler_system_test)
 
       _add_test(NAME ${tname}
                 COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env ${SYSTEMTEST_SCRIPT}
-                      --exec ${tname}
-                      --ini "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
-                      --source ${CMAKE_CURRENT_SOURCE_DIR}
-                )
+                --exec ${tname}
+                --ini "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
+                --source ${CMAKE_CURRENT_SOURCE_DIR}
+                --mpi-exec "${MPIEXEC}"
+                --mpi-numprocflag=${MPIEXEC_NUMPROC_FLAG}
+                --mpi-preflags "${MPIEXEC_PREFLAGS}"
+                --mpi-postflags "${MPIEXEC_POSTFLAGS}"
+                --max-processors=${DUNE_MAX_TEST_CORES}
+               )
 
       set_tests_properties(${tname} PROPERTIES SKIP_RETURN_CODE 77)
       set_tests_properties(${tname} PROPERTIES TIMEOUT 60)
diff --git a/cmake/modules/deplist.py b/cmake/modules/deplist.py
new file mode 100755
index 0000000000000000000000000000000000000000..9cb5d7d42cbfc712e18e37798dffc2d553416f8c
--- /dev/null
+++ b/cmake/modules/deplist.py
@@ -0,0 +1,37 @@
+# Emit, for one ini file, the operators to generate and the header file
+# each of them is written to.
+# This is used by the build system, do not use this yourself!
+
+from dune.testtools.parser import parse_ini_file
+from dune.testtools.cmakeoutput import printForCMake
+
+import sys
+
+ini = parse_ini_file(sys.argv[1])
+
+
+def get_filename(operator):
+    """Return the header filename for the given operator.
+
+    An explicit 'filename' key in the [formcompiler.<operator>] section
+    wins; otherwise the name is derived from the target name (second
+    command line argument) and the operator's class name.
+    """
+    ssection = ini.get("formcompiler.{}".format(operator), {})
+    if not ssection.get("filename", None):
+        default_classname = "{}Operator".format(ssection.get("form", operator))
+        classname = ssection.get("classname", default_classname)
+        return "{}_{}_file.hh".format(sys.argv[2], classname)
+    return ssection["filename"]
+
+
+# Comma separated list of operator names, defaulting to the single operator "r"
+operators = [name.strip() for name in ini["formcompiler"].get("operators", "r").split(",")]
+
+result = {}
+for name in operators:
+    result["__{}".format(name)] = get_filename(name)
+result["__operators"] = ";".join(operators)
+
+printForCMake(result)
+sys.exit(0)
diff --git a/dune/perftool/blockstructured/blockstructuredqkfem.hh b/dune/perftool/blockstructured/blockstructuredqkfem.hh
index fd27ef36d65ac8f25c18f3f44fd0ebf68d16189e..90ad10e684fe2735527e527701dc4962b510e3d7 100644
--- a/dune/perftool/blockstructured/blockstructuredqkfem.hh
+++ b/dune/perftool/blockstructured/blockstructuredqkfem.hh
@@ -15,7 +15,7 @@ namespace Dune {
     //! \ingroup FiniteElementMap
     template<typename GV, typename D, typename R, std::size_t k>
     class BlockstructuredQkLocalFiniteElementMap
-        : public SimpleLocalFiniteElementMap< Dune::QkLocalFiniteElement<D,R,GV::dimension,k> >
+        : public SimpleLocalFiniteElementMap< Dune::QkLocalFiniteElement<D,R,GV::dimension,k>, GV::dimension>
     {
 
     public:
diff --git a/dune/perftool/common/opcounter.hh b/dune/perftool/common/opcounter.hh
index a103a06550665b26e840bf083c60b8019296b2be..edb16eaacc39f3c383fc7ace2fc7b733354116e7 100644
--- a/dune/perftool/common/opcounter.hh
+++ b/dune/perftool/common/opcounter.hh
@@ -16,6 +16,16 @@ namespace oc {
   template<typename F>
   class OpCounter;
 
+  template<typename T>
+  struct isOpCounter : public std::false_type
+  {};
+
+  template<typename F>
+  struct isOpCounter<OpCounter<F>> : public std::true_type
+  {};
+
+  template<typename T>
+  constexpr bool isOpCounterV = isOpCounter<T>::value;
 }
 
 namespace Dune {
diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh
index 3d3fae8b61d6a435391160456952144dfd12d96d..6204b0a213796861533b9668f4c9a869198eb7cc 100644
--- a/dune/perftool/common/vectorclass.hh
+++ b/dune/perftool/common/vectorclass.hh
@@ -11,12 +11,38 @@
 
 #define BARRIER asm volatile("": : :"memory")
 
+template<typename T>
+struct base_floatingpoint
+{};
+
 #ifndef ENABLE_COUNTER
 
 #include <dune/perftool/vectorclass/vectorclass.h>
 #include <dune/perftool/vectorclass/vectormath_exp.h>
 #include <dune/perftool/vectorclass/vectormath_trig.h>
 
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = float;
+};
+
+#if MAX_VECTOR_SIZE >= 512
+
+template<>
+struct base_floatingpoint<Vec8d>
+{
+  using value = double;
+};
+
+#endif
+
 #else
 
 #include <algorithm>
@@ -46,10 +72,11 @@ struct Vec4d
     BARRIER;
   }
 
-  Vec4d(double d)
+  Vec4d(F dl, F du)
   {
     BARRIER;
-    std::fill(_d,_d+4,d);
+    std::fill(_d,_d+2,dl);
+    std::fill(_d+2,_d+4,du);
     BARRIER;
   }
 
@@ -114,6 +141,11 @@ struct Vec4d
 
 };
 
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = typename Vec4d::F;
+};
 
 /*****************************************************************************
 *
@@ -142,7 +174,7 @@ static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
 static inline Vec4d operator ++ (Vec4d & a, int) {
   BARRIER;
   Vec4d a0 = a;
-  a = a + 1.0;
+  a = a + Vec4d(1.0);
   BARRIER;
   return a0;
 }
@@ -150,7 +182,7 @@ static inline Vec4d operator ++ (Vec4d & a, int) {
 // prefix operator ++
 static inline Vec4d & operator ++ (Vec4d & a) {
   BARRIER;
-  a = a + 1.0;
+  a = a + Vec4d(1.0);
   BARRIER;
   return a;
 }
@@ -187,7 +219,7 @@ static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
 static inline Vec4d operator -- (Vec4d & a, int) {
   BARRIER;
   Vec4d a0 = a;
-  a = a - 1.0;
+  a = a - Vec4d(1.0);
   BARRIER;
   return a0;
 }
@@ -195,7 +227,7 @@ static inline Vec4d operator -- (Vec4d & a, int) {
 // prefix operator --
 static inline Vec4d & operator -- (Vec4d & a) {
   BARRIER;
-  a = a - 1.0;
+  a = a - Vec4d(1.0);
   BARRIER;
   return a;
 }
@@ -248,6 +280,30 @@ static inline _vcl::Vec4db operator == (Vec4d const & a, Vec4d const & b) {
   return a_ == b_;
 }
 
+// operator == (scalar, vector) : returns true for elements of b for which a == b[i]
+static inline _vcl::Vec4db operator == (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ == b_;
+}
+
+// operator == (vector, scalar) : returns true for elements of b for which b[i] == a
+static inline _vcl::Vec4db operator == (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ == b_;
+}
+
 // vector operator != : returns true for elements for which a != b
 static inline _vcl::Vec4db operator != (Vec4d const & a, Vec4d const & b) {
   BARRIER;
@@ -262,6 +318,30 @@ static inline _vcl::Vec4db operator != (Vec4d const & a, Vec4d const & b) {
   return a_ != b_;
 }
 
+// operator != (scalar, vector) : returns true for elements of b for which a != b[i]
+static inline _vcl::Vec4db operator != (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ != b_;
+}
+
+// operator != (vector, scalar) : returns true for elements of b for which b[i] != a
+static inline _vcl::Vec4db operator != (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ != b_;
+}
+
 // vector operator < : returns true for elements for which a < b
 static inline _vcl::Vec4db operator < (Vec4d const & a, Vec4d const & b) {
   BARRIER;
@@ -276,6 +356,30 @@ static inline _vcl::Vec4db operator < (Vec4d const & a, Vec4d const & b) {
   return a_ < b_;
 }
 
+// operator < (scalar, vector) : returns true for elements of b for which a < b[i]
+static inline _vcl::Vec4db operator < (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ < b_;
+}
+
+// operator < (vector, scalar) : returns true for elements of b for which b[i] < a
+static inline _vcl::Vec4db operator < (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return b_ < a_;
+}
+
 // vector operator <= : returns true for elements for which a <= b
 static inline _vcl::Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
   BARRIER;
@@ -290,16 +394,61 @@ static inline _vcl::Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
   return a_ <= b_;
 }
 
+// vector operator <= : returns true for elements for which a <= b
+static inline _vcl::Vec4db operator <= (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ <= b_;
+}
+
+// vector operator <= (vector b, scalar a): returns true for elements of b for which b[i] <= a
+static inline _vcl::Vec4db operator <= (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return b_ <= a_;
+}
+
 // vector operator > : returns true for elements for which a > b
 static inline _vcl::Vec4db operator > (Vec4d const & a, Vec4d const & b) {
     return b < a;
 }
 
+// vector operator > (scalar a, vector b): returns true for elements of b for which a > b[i], evaluated as b < a
+static inline _vcl::Vec4db operator > (oc::OpCounter<double> a, Vec4d const & b) {
+    return b < a;
+}
+
+// vector operator > (vector b, scalar a): returns true for elements of b for which b[i] > a, evaluated as a < b
+static inline _vcl::Vec4db operator > (Vec4d const & b, oc::OpCounter<double> a) {
+    return a < b;
+}
+
 // vector operator >= : returns true for elements for which a >= b
 static inline _vcl::Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
     return b <= a;
 }
 
+// vector operator >= : returns true for elements for which a >= b
+static inline _vcl::Vec4db operator >= (oc::OpCounter<double> a, Vec4d const & b) {
+    return b <= a;
+}
+
+// vector operator >= (vector b, scalar a): returns true for elements of b for which b[i] >= a, evaluated as a <= b
+static inline _vcl::Vec4db operator >= (Vec4d const & b, oc::OpCounter<double> a) {
+    return a <= b;
+}
+
+
 // avoid logical operators for now, I don't think we need them
 #if 0
 
@@ -415,81 +564,40 @@ static inline Vec4d exp(Vec4d const & a){
   return r;
 }
 
-
-// ignore pow() for now
-#if 0
-
-// pow(Vec4d, int):
-template <typename TT> static Vec4d pow(Vec4d const & a, TT n);
-
-// Raise floating point numbers to integer power n
-template <>
-inline Vec4d pow<int>(Vec4d const & x0, int n) {
-    return pow_template_i<Vec4d>(x0, n);
+// pow
+template <typename TT>
+static inline Vec4d pow(Vec4d const & a, oc::OpCounter<TT> n)
+{
+  BARRIER;
+  Vec4d r;
+  std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); });
+  BARRIER;
+  return r;
 }
 
-// allow conversion from unsigned int
-template <>
-inline Vec4d pow<uint32_t>(Vec4d const & x0, uint32_t n) {
-    return pow_template_i<Vec4d>(x0, (int)n);
+// pow
+template <typename TT>
+static inline
+std::enable_if_t<not oc::isOpCounterV<TT>, Vec4d> pow(Vec4d const & a, TT n)
+{
+  BARRIER;
+  Vec4d r;
+  std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); });
+  BARRIER;
+  return r;
 }
 
 
-// Raise floating point numbers to integer power n, where n is a compile-time constant
-template <int n>
-static inline Vec4d pow_n(Vec4d const & a) {
-    if (n < 0)    return Vec4d(1.0) / pow_n<-n>(a);
-    if (n == 0)   return Vec4d(1.0);
-    if (n >= 256) return pow(a, n);
-    Vec4d x = a;                       // a^(2^i)
-    Vec4d y;                           // accumulator
-    const int lowest = n - (n & (n-1));// lowest set bit in n
-    if (n & 1) y = x;
-    if (n < 2) return y;
-    x = x*x;                           // x^2
-    if (n & 2) {
-        if (lowest == 2) y = x; else y *= x;
-    }
-    if (n < 4) return y;
-    x = x*x;                           // x^4
-    if (n & 4) {
-        if (lowest == 4) y = x; else y *= x;
-    }
-    if (n < 8) return y;
-    x = x*x;                           // x^8
-    if (n & 8) {
-        if (lowest == 8) y = x; else y *= x;
-    }
-    if (n < 16) return y;
-    x = x*x;                           // x^16
-    if (n & 16) {
-        if (lowest == 16) y = x; else y *= x;
-    }
-    if (n < 32) return y;
-    x = x*x;                           // x^32
-    if (n & 32) {
-        if (lowest == 32) y = x; else y *= x;
-    }
-    if (n < 64) return y;
-    x = x*x;                           // x^64
-    if (n & 64) {
-        if (lowest == 64) y = x; else y *= x;
-    }
-    if (n < 128) return y;
-    x = x*x;                           // x^128
-    if (n & 128) {
-        if (lowest == 128) y = x; else y *= x;
-    }
-    return y;
-}
-
-template <int n>
-static inline Vec4d pow(Vec4d const & a, Const_int_t<n>) {
-    return pow_n<n>(a);
+static inline Vec4d select(const _vcl::Vec4db& s, const Vec4d& a, const Vec4d& b)
+{
+  BARRIER;
+  Vec4d r;
+  for(int i=0; i<4; ++i)
+    r._d[i] = s.extract(i) ? a._d[i] : b._d[i];
+  BARRIER;
+  return r;
 }
 
-#endif
-
 // function round: round to nearest integer (even). (result as double vector)
 static inline Vec4d round(Vec4d const & a) {
   BARRIER;
@@ -632,10 +740,19 @@ struct Vec8d
     BARRIER;
   }
 
-  Vec8d(double d)
+  Vec8d(F dl, F du)
   {
     BARRIER;
-    std::fill(_d,_d+8,d);
+    std::fill(_d,_d+4,dl);
+    std::fill(_d+4,_d+8,du);
+    BARRIER;
+  }
+
+  Vec8d(Vec4d low, Vec4d high)  // lanes 0-3 are taken from low, lanes 4-7 from high
+  {
+    BARRIER;
+    std::copy(low._d, low._d+4, _d);      // source range first, destination last
+    std::copy(high._d, high._d+4, _d+4);
     BARRIER;
   }
 
@@ -645,6 +762,24 @@ struct Vec8d
     BARRIER;
   }
 
+  Vec4d get_low() const
+  {
+    BARRIER;
+    Vec4d ret;
+    ret.load(_d);
+    BARRIER;
+    return ret;
+  }
+
+  Vec4d get_high() const
+  {
+    BARRIER;
+    Vec4d ret;
+    ret.load(_d + 4);
+    BARRIER;
+    return ret;
+  }
+
   Vec8d& load(const F* p)
   {
     BARRIER;
@@ -700,6 +835,11 @@ struct Vec8d
 
 };
 
+template<>
+struct base_floatingpoint<Vec8d>
+{
+  using value = typename Vec8d::F;
+};
 
 /*****************************************************************************
 *
@@ -728,7 +868,7 @@ static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) {
 static inline Vec8d operator ++ (Vec8d & a, int) {
   BARRIER;
   Vec8d a0 = a;
-  a = a + 1.0;
+  a = a + Vec8d(1.0);
   BARRIER;
   return a0;
 }
@@ -736,7 +876,7 @@ static inline Vec8d operator ++ (Vec8d & a, int) {
 // prefix operator ++
 static inline Vec8d & operator ++ (Vec8d & a) {
   BARRIER;
-  a = a + 1.0;
+  a = a + Vec8d(1.0);
   BARRIER;
   return a;
 }
@@ -773,7 +913,7 @@ static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) {
 static inline Vec8d operator -- (Vec8d & a, int) {
   BARRIER;
   Vec8d a0 = a;
-  a = a - 1.0;
+  a = a - Vec8d(1.0);
   BARRIER;
   return a0;
 }
@@ -781,7 +921,7 @@ static inline Vec8d operator -- (Vec8d & a, int) {
 // prefix operator --
 static inline Vec8d & operator -- (Vec8d & a) {
   BARRIER;
-  a = a - 1.0;
+  a = a - Vec8d(1.0);
   BARRIER;
   return a;
 }
@@ -1139,10 +1279,15 @@ struct Vec8f
 
 };
 
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = typename Vec8f::F;
+};
 
 /*****************************************************************************
 *
-*          Operators for Vec4d
+*          Operators for Vec8f
 *
 *****************************************************************************/
 
@@ -1167,7 +1312,7 @@ static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
 static inline Vec8f operator ++ (Vec8f & a, int) {
   BARRIER;
   Vec8f a0 = a;
-  a = a + 1.0;
+  a = a + Vec8f(1.0);
   BARRIER;
   return a0;
 }
@@ -1175,7 +1320,7 @@ static inline Vec8f operator ++ (Vec8f & a, int) {
 // prefix operator ++
 static inline Vec8f & operator ++ (Vec8f & a) {
   BARRIER;
-  a = a + 1.0;
+  a = a + Vec8f(1.0);
   BARRIER;
   return a;
 }
@@ -1212,7 +1357,7 @@ static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
 static inline Vec8f operator -- (Vec8f & a, int) {
   BARRIER;
   Vec8f a0 = a;
-  a = a - 1.0;
+  a = a - Vec8f(1.0);
   BARRIER;
   return a0;
 }
@@ -1220,7 +1365,7 @@ static inline Vec8f operator -- (Vec8f & a, int) {
 // prefix operator --
 static inline Vec8f & operator -- (Vec8f & a) {
   BARRIER;
-  a = a - 1.0;
+  a = a - Vec8f(1.0);
   BARRIER;
   return a;
 }
diff --git a/dune/perftool/sumfact/horizontaladd.hh b/dune/perftool/sumfact/horizontaladd.hh
new file mode 100644
index 0000000000000000000000000000000000000000..db7634f0e5507214318dabb719240ff3674808ed
--- /dev/null
+++ b/dune/perftool/sumfact/horizontaladd.hh
@@ -0,0 +1,19 @@
+#ifndef DUNE_PERFTOOL_SUMFACT_HORIZONTALADD_HH
+#define DUNE_PERFTOOL_SUMFACT_HORIZONTALADD_HH
+
+#include<dune/perftool/common/vectorclass.hh>
+
+
+template<class V>
+typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
+{
+  return horizontal_add(x.get_low());
+}
+
+template<class V>
+typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
+{
+  return horizontal_add(x.get_high());
+}
+
+#endif
diff --git a/dune/perftool/sumfact/onedquadrature.hh b/dune/perftool/sumfact/onedquadrature.hh
index 72c4cde64cc7afb7e337b7bb7744ce33f8977e22..6ff3195c54f29c843ca5822fdaf05710834f57de 100644
--- a/dune/perftool/sumfact/onedquadrature.hh
+++ b/dune/perftool/sumfact/onedquadrature.hh
@@ -30,7 +30,7 @@ void onedQuadraturePointsWeights(RF (&qp)[m], RF (&qw)[m]){
   } // end 1D quadrature loop
   // Order 1D quadrature points lexicographically
   for (size_t j=0; j<m/2; j++){
-    if (qp[j]>0.5){
+    if (qp[j]>DF(0.5)){
       RF temp=qp[j];
       qp[j] = qp[m-1-j];
       qp[m-1-j] = temp;
diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index f73c6a2f717b243e32411a5feea948c7777ce430..d2ce09c39bf6ff5b87cc236cbf5d2a5bfcf850f1 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -66,6 +66,9 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
   a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
 }
 
+/** TODO: Is this transpose using blend8d superior to the swap_halves
+ *        version below using get_low/get_high?
+ */
 void transpose_reg (Vec8d& a0, Vec8d& a1)
 {
   Vec8d b0, b1;
@@ -75,6 +78,48 @@ void transpose_reg (Vec8d& a0, Vec8d& a1)
   a1 = b1;
 }
 
+namespace impl
+{
+  /* (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp) */
+  void swap_halves(Vec8d& a, Vec8d& b)
+  {
+    Vec4d tmp = a.get_high();
+    a = Vec8d(a.get_low(), b.get_low());
+    b = Vec8d(tmp, b.get_high());
+  }
+
+  /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose
+   * on the lower and upper halves of the Vec8d
+   */
+  void _transpose4x8(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
+  {
+    Vec8d b0,b1,b2,b3;
+    b0 = blend8d<0,8,2,10,4,12,6,14>(a0,a1);
+    b1 = blend8d<1,9,3,11,5,13,7,15>(a0,a1);
+    b2 = blend8d<0,8,2,10,4,12,6,14>(a2,a3);
+    b3 = blend8d<1,9,3,11,5,13,7,15>(a2,a3);
+    a0 = blend8d<0,1,8,9,4,5,12,13>(b0,b2);
+    a1 = blend8d<0,1,8,9,4,5,12,13>(b1,b3);
+    a2 = blend8d<2,3,10,11,6,7,14,15>(b0,b2);
+    a3 = blend8d<2,3,10,11,6,7,14,15>(b1,b3);
+  }
+}
+
+/* This is the 8x8 transpose of Vec8d's. It uses the same shuffling
+ * as Vec4d, but on the 4x4 subblocks. Afterwards, the off diagonal
+ * blocks are swapped.
+ */
+void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3,
+                   Vec8d& a4, Vec8d& a5, Vec8d& a6, Vec8d& a7)
+{
+  impl::_transpose4x8(a0,a1,a2,a3);
+  impl::_transpose4x8(a4,a5,a6,a7);
+  impl::swap_halves(a0,a4);
+  impl::swap_halves(a1,a5);
+  impl::swap_halves(a2,a6);
+  impl::swap_halves(a3,a7);
+}
+
 #endif
 
 #endif
diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh
index 4abe4fa9de8f80bac0465ccfa7724035ce586063..5fa3ab5e28c162fb391e0b66ec5aec066d4b903a 100755
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -2,6 +2,7 @@
 
 pushd python/loopy
 git apply ../../patches/loopy/Current.patch
+git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
 popd
 
 pushd dune/perftool/vectorclass
@@ -12,7 +13,3 @@ pushd python/ufl
 git apply ../../patches/ufl/conditional-uflid.patch
 git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch
 popd
-
-pushd python/ufl
-git apply ../../patches/ufl/tensor-product-element.patch
-popd
diff --git a/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch b/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
new file mode 100644
index 0000000000000000000000000000000000000000..436533b399471411d105addab125814de66ec4e5
--- /dev/null
+++ b/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
@@ -0,0 +1,33 @@
+From abac8a2068e0333a0f00c276519c24c5c16bedf4 Mon Sep 17 00:00:00 2001
+From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
+Date: Mon, 26 Mar 2018 11:13:42 +0200
+Subject: [PATCH] Disable a logging statement that breaks
+
+---
+ loopy/kernel/tools.py | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
+index 15840180..cb877eb6 100644
+--- a/loopy/kernel/tools.py
++++ b/loopy/kernel/tools.py
+@@ -197,11 +197,11 @@ def find_all_insn_inames(kernel):
+         assert isinstance(write_deps, frozenset), type(insn)
+         assert isinstance(iname_deps, frozenset), type(insn)
+ 
+-        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
+-                "read deps: %s - write deps: %s" % (
+-                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
+-                    ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)),
+-                    ))
++#         logger.debug("%s: find_all_insn_inames: %s (init): %s - "
++#                 "read deps: %s - write deps: %s" % (
++#                     kernel.name, insn.id, ", ".join(sorted(iname_deps)),
++#                     ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)),
++#                     ))
+ 
+         insn_id_to_inames[insn.id] = iname_deps
+         insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
+-- 
+2.11.0
+
diff --git a/patches/ufl/tensor-product-element.patch b/patches/ufl/tensor-product-element.patch
deleted file mode 100644
index 9fc64f124e95bcbf28391ed5be4c87e1040a4e28..0000000000000000000000000000000000000000
--- a/patches/ufl/tensor-product-element.patch
+++ /dev/null
@@ -1,19 +0,0 @@
-commit f87dcd18d765b0200808b79b2e7374f82a0c6199
-Author: RenÃ© HeÃŸ <rene.hess@iwr.uni-heidelberg.de>
-Date:   Tue Aug 29 14:56:17 2017 +0200
-
-    Patch for TensorProductElements
-
-diff --git a/ufl/algorithms/compute_form_data.py b/ufl/algorithms/compute_form_data.py
-index 3388bbfc..1cef3924 100644
---- a/ufl/algorithms/compute_form_data.py
-+++ b/ufl/algorithms/compute_form_data.py
-@@ -56,7 +56,7 @@ def _auto_select_degree(elements):
-     """
-     # Use max degree of all elements, at least 1 (to work with
-     # Lagrange elements)
--    return max({e.degree() for e in elements} - {None} | {1})
-+    return max({e.degree() if not isinstance(e.degree(), tuple) else max(e.degree()) for e in elements} - {None} | {1})
- 
- 
- def _compute_element_mapping(form):
diff --git a/python/cgen b/python/cgen
index 0062a75a614db6602012b6e926c4b5ced06fcc89..f411383630b272a3a5d3e28b82acaaa530a64723 160000
--- a/python/cgen
+++ b/python/cgen
@@ -1 +1 @@
-Subproject commit 0062a75a614db6602012b6e926c4b5ced06fcc89
+Subproject commit f411383630b272a3a5d3e28b82acaaa530a64723
diff --git a/python/dune/perftool/blockstructured/geometry.py b/python/dune/perftool/blockstructured/geometry.py
index 951ada695f3c8d29d78cbf8992d2fe7172470083..1ffbdf0417c8261d2f0882b2384ed044af1ddd03 100644
--- a/python/dune/perftool/blockstructured/geometry.py
+++ b/python/dune/perftool/blockstructured/geometry.py
@@ -2,7 +2,7 @@ from dune.perftool.generation import (get_backend,
                                       temporary_variable,
                                       instruction)
 from dune.perftool.tools import get_pymbolic_basename
-from dune.perftool.options import (get_option,
+from dune.perftool.options import (get_form_option,
                                    option_switch)
 from dune.perftool.pdelab.geometry import (name_jacobian_determinant,
                                            local_dimension,
@@ -15,20 +15,20 @@ import pymbolic.primitives as prim
 # scale determinant according to the order of the blockstructure
 def pymbolic_jacobian_determinant():
     return prim.Quotient(prim.Variable(name_jacobian_determinant()),
-                         prim.Power(get_option("number_of_blocks"), local_dimension()))
+                         prim.Power(get_form_option("number_of_blocks"), local_dimension()))
 
 
 # scale Jacobian according to the order of the blockstructure
 def pymbolic_jacobian_inverse_transposed(i, j, restriction):
     name_jit = get_backend(interface="name_jit", selector=option_switch("constant_transformation_matrix"))(restriction)
-    return prim.Product((get_option("number_of_blocks"),
+    return prim.Product((get_form_option("number_of_blocks"),
                          prim.Subscript(prim.Variable(name_jit), (j, i))))
 
 
 # scale determinant according to the order of the blockstructure
 def pymbolic_facet_jacobian_determinant():
     return prim.Quotient(prim.Variable(name_facet_jacobian_determinant()),
-                         prim.Power(get_option("number_of_blocks"), local_dimension()))
+                         prim.Power(get_form_option("number_of_blocks"), local_dimension()))
 
 
 # translate a point in the micro element into macro coordinates
@@ -45,7 +45,7 @@ def define_point_in_macro(name, point_in_micro):
         else:
             expr = prim.Subscript(point_in_micro, (i,))
         expr = prim.Sum((expr, prim.Variable(subelem_inames[i]),))
-        expr = prim.Quotient(expr, get_option('number_of_blocks'))
+        expr = prim.Quotient(expr, get_form_option('number_of_blocks'))
         instruction(assignee=prim.Subscript(prim.Variable(name), (i,)),
                     expression=expr,
                     within_inames=frozenset(subelem_inames),
diff --git a/python/dune/perftool/blockstructured/tools.py b/python/dune/perftool/blockstructured/tools.py
index e3f1416e150920beb1d9ba186a34d5f0c1b384b6..a9bf01f26f5a52daa91f6cbbcf01bd31dffabb36 100644
--- a/python/dune/perftool/blockstructured/tools.py
+++ b/python/dune/perftool/blockstructured/tools.py
@@ -11,7 +11,7 @@ from dune.perftool.pdelab.geometry import (local_dimension,
 
 from dune.perftool.pdelab.quadrature import quadrature_inames
 from dune.perftool.generation.counter import get_counted_variable
-from dune.perftool.options import get_option
+from dune.perftool.options import get_form_option
 import pymbolic.primitives as prim
 
 
@@ -19,12 +19,13 @@ import pymbolic.primitives as prim
 # i.e. each element has (i_1,i_2,...,i_d) indices
 @iname
 def sub_element_inames():
+    name = "subel"
     dim = local_dimension()
     dim_names = ["x", "y", "z"] + [str(i) for i in range(4, dim + 1)]
     inames = tuple()
     for i in range(dim):
         inames = inames + ("subel_" + dim_names[i],)
-        domain("subel_" + dim_names[i], get_option("number_of_blocks"))
+        domain("subel_" + dim_names[i], get_form_option("number_of_blocks"))
     return inames
 
 
@@ -37,7 +38,7 @@ def sub_element_inames():
 def sub_facet_inames():
     subelem_inames = sub_element_inames()
 
-    center = pymbolic_in_cell_coordinates(prim.Variable(name_localcenter()), Restriction.NEGATIVE)
+    center = pymbolic_in_cell_coordinates(prim.Variable(name_localcenter()), Restriction.POSITIVE)
 
     # check if iname[index] must be constant or not
     def predicate(index):
@@ -58,7 +59,7 @@ def sub_facet_inames():
                     predicates=frozenset([prim.LogicalNot(predicate(index))])
                     )
 
-    k = get_option("number_of_blocks")
+    k = get_form_option("number_of_blocks")
 
     inames = ("x",)
     temporary_variable(inames[0])
@@ -112,7 +113,7 @@ def micro_index_to_macro_index(element, inames):
     elif it == "exterior_facet" or it == "interior_facet":
         subelem_inames = sub_facet_inames()
 
-    k = get_option("number_of_blocks")
+    k = get_form_option("number_of_blocks")
     p = element.degree()
     return prim.Sum(tuple((p * prim.Variable(si) + prim.Variable(bi)) * (p * k + 1) ** i
                           for i, (si, bi) in enumerate(zip(subelem_inames, inames))))
diff --git a/python/dune/perftool/cgen/__init__.py b/python/dune/perftool/cgen/__init__.py
index 128c4dedffb347b256ed64b518fdde9009ce9344..24af73e0d8a2b6dfce3c1146d86c9eeccf7e5a2a 100644
--- a/python/dune/perftool/cgen/__init__.py
+++ b/python/dune/perftool/cgen/__init__.py
@@ -3,6 +3,7 @@ from __future__ import absolute_import
 from cgen import *
 
 from dune.perftool.cgen.clazz import Class
+from dune.perftool.cgen.exceptions import TryCatchBlock, CatchBlock
 
 
 class Namespace(PrivateNamespace):
diff --git a/python/dune/perftool/cgen/clazz.py b/python/dune/perftool/cgen/clazz.py
index dca212eca2aecee421bee757c900f2f1778949d8..7f74353f8a8ad09960e4aaa5340c83c8fe0357b6 100644
--- a/python/dune/perftool/cgen/clazz.py
+++ b/python/dune/perftool/cgen/clazz.py
@@ -34,9 +34,10 @@ class BaseClass(Generable):
 
 
 class ClassMember(Generable):
-    def __init__(self, member, access=AccessModifier.PUBLIC):
+    def __init__(self, member, access=AccessModifier.PUBLIC, name=""):
         self.member = member
         self.access = access
+        self.name = name
 
         if isinstance(member, str):
             from cgen import Line
diff --git a/python/dune/perftool/cgen/exceptions.py b/python/dune/perftool/cgen/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..434b3a62eb7dbc7d4fcc2fbe40cb41b53e202401
--- /dev/null
+++ b/python/dune/perftool/cgen/exceptions.py
@@ -0,0 +1,40 @@
+""" Add Try/Catch blocks to cgen """
+
+from cgen import Block, Generable, Value
+
+
+class CatchBlock(Generable):
+    def __init__(self, exc_decl, catch_block):
+        assert isinstance(exc_decl, Value)
+        self.exc_decl = exc_decl
+        assert isinstance(catch_block, Block)
+        self.catch_block = catch_block
+
+    def generate(self):
+        yield "catch ({})\n".format("".join(self.exc_decl.generate(with_semicolon=False)))
+        for item in self.catch_block.generate():
+            yield item
+        yield "\n"
+
+
+class TryCatchBlock(Generable):
+    def __init__(self, try_block, catch_blocks):
+        # Store the try block
+        assert isinstance(try_block, Block)
+        self.try_block = try_block
+
+        assert all(isinstance(b, CatchBlock) for b in catch_blocks)
+        self.catch_blocks = catch_blocks
+
+    def generate(self):
+        # Yield the try block
+        yield "\n"
+        yield "try\n"
+        for item in self.try_block.generate():
+            yield item
+        yield "\n"
+
+        # and now yield all the catch blocks
+        for catch_block in self.catch_blocks:
+            for item in catch_block.generate():
+                yield item
diff --git a/python/dune/perftool/compile.py b/python/dune/perftool/compile.py
index cd5344c0da55f68b88a3ea1809f800105160fbbc..ed4c1310a9274528a3764f2739d7ee59c025ef21 100644
--- a/python/dune/perftool/compile.py
+++ b/python/dune/perftool/compile.py
@@ -16,13 +16,14 @@ from ufl.algorithms.formfiles import interpret_ufl_namespace
 from dune.perftool.generation import (delete_cache_items,
                                       global_context,
                                       )
-from dune.perftool.interactive import start_interactive_session
-from dune.perftool.options import get_option, initialize_options
+from dune.perftool.options import (get_form_option,
+                                   get_option,
+                                   initialize_options,
+                                   )
 from dune.perftool.pdelab.driver import generate_driver
-from dune.perftool.pdelab.localoperator import (generate_localoperator_basefile,
-                                                generate_localoperator_file,
+from dune.perftool.pdelab.localoperator import (generate_localoperator_file,
                                                 generate_localoperator_kernels,
-                                                name_localoperator_file)
+                                                )
 from dune.perftool.ufl.preprocess import preprocess_form
 
 from os.path import splitext, basename, join, dirname, abspath
@@ -53,8 +54,8 @@ def read_ufl(uflfile):
 
     Returns:
     --------
-    formdatas: List of formdatas found in uflfile.
-    forms: List of forms found in uflfile.
+    data: The data in the namespace after execution of the UFL file
+          and some custom postprocessing.
     """
     # Read the given ufl file and execute it
     uflcode = read_ufl_file(uflfile)
@@ -92,54 +93,39 @@ def read_ufl(uflfile):
     if get_option("exact_solution_expression"):
         data.object_by_name[get_option("exact_solution_expression")] = namespace[get_option("exact_solution_expression")]
 
-    magic_names = ("dirichlet_expression",
+    magic_names = ("interpolate_expression",
                    "is_dirichlet",
                    "exact_solution",
                    )
     for name in magic_names:
         data.object_by_name[name] = namespace.get(name, None)
 
-    formdatas = []
-    forms = data.forms
-    for index, form in enumerate(forms):
-        formdatas.append(preprocess_form(form))
-        forms[index] = formdatas[index].preprocessed_form
+    return data
 
-    # We expect at least one form
-    assert len(data.forms) >= 1
 
-    return formdatas, data
-
-
-# This function is the entrypoint of the ufl2pdelab executable
-def compile_form():
+def entry_generate_driver():
+    """ This is the entry point for driver generation """
     initialize_options()
-    formdatas, data = read_ufl(get_option("uflfile"))
+    data = read_ufl(get_option("uflfile"))
 
-    with global_context(data=data, formdatas=formdatas):
-        # Generate driver file
-        if get_option("driver_file"):
-            generate_driver(formdatas, data)
+    with global_context(data=data):
+        generate_driver()
 
-    # In case of multiple forms: Genarate one file that includes all localoperator files
-    if len(formdatas) > 1:
-        generate_localoperator_basefile(formdatas, data)
 
-    # Generate local operator files
-    for formdata in formdatas:
-        with global_context(data=data, formdata=formdata):
+def entry_generate_operators():
+    """ This is the entry point for operator generation """
+    initialize_options()
+    data = read_ufl(get_option("uflfile"))
+
+    with global_context(data=data):
+        operator = get_option("operator_to_build")
+        with global_context(form_identifier=operator):
             # Make sure cache is empty
             delete_cache_items()
 
-            # Create localoperator kernels
-            if get_option("operator_file"):
-                kernels = generate_localoperator_kernels(formdata, data)
-
-            # TODO insert sophisticated analysis/feedback loops here
-            if get_option("interactive"):
-                start_interactive_session(kernels)
+            # Choose the form from the UFL input
+            kernels = generate_localoperator_kernels(operator)
 
-            # Create c++ file from kernels
-            if get_option("operator_file"):
-                filename = name_localoperator_file(formdata, data)
-                generate_localoperator_file(formdata, kernels, filename)
+            # Write the result to a file
+            filename = get_form_option("filename")
+            generate_localoperator_file(kernels, filename)
diff --git a/python/dune/perftool/error.py b/python/dune/perftool/error.py
index 3f99a83abeb947890b5c7fb36ffd2663514796fb..4d428b41b516845c3eb18803539edbf293f44cf1 100644
--- a/python/dune/perftool/error.py
+++ b/python/dune/perftool/error.py
@@ -15,3 +15,7 @@ class PerftoolCodegenError(PerftoolError):
 
 class PerftoolLoopyError(PerftoolError):
     pass
+
+
+class PerftoolVectorizationError(PerftoolCodegenError):
+    pass
diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index c8c085c178d7dc2224d1efd1d76efaacf9da5259..e541e71366371039157cb757201f710ffd0a19ca 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -43,6 +43,7 @@ from dune.perftool.generation.loopy import (barrier,
                                             kernel_cached,
                                             noop_instruction,
                                             silenced_warning,
+                                            subst_rule,
                                             temporary_variable,
                                             transform,
                                             valuearg,
diff --git a/python/dune/perftool/generation/cache.py b/python/dune/perftool/generation/cache.py
index 474e0e104040bb0caefea5361e5868790380784f..b4ae54f28479d607907ac59c3c6078283535bc85 100644
--- a/python/dune/perftool/generation/cache.py
+++ b/python/dune/perftool/generation/cache.py
@@ -69,6 +69,7 @@ class _RegisteredFunction(object):
                  on_store=lambda x: x,
                  item_tags=(),
                  context_tags=(),
+                 section=None,
                  **kwargs
                  ):
         self.func = func
@@ -78,6 +79,8 @@ class _RegisteredFunction(object):
         self.item_tags = item_tags
         self.context_tags = context_tags
         self.kwargs = kwargs
+        if section:
+            self.item_tags = self.item_tags + (section,)
 
         # Initialize the memoization cache
         self._memoize_cache = {}
diff --git a/python/dune/perftool/generation/cpp.py b/python/dune/perftool/generation/cpp.py
index 0f44b6950ef9b111f90f0d7a922835912c5b4500..858dc1fcaf50d07b08c044525db994f85ca8de29 100644
--- a/python/dune/perftool/generation/cpp.py
+++ b/python/dune/perftool/generation/cpp.py
@@ -18,7 +18,7 @@ template_parameter = generator_factory(item_tags=("template_param",), context_ta
 class_basename = generator_factory(item_tags=("basename",), context_tags=("classtag",))
 
 
-@generator_factory(item_tags=("file", "include"), context_tags=("filetag",))
+@generator_factory(item_tags=("file", "include"), context_tags=("filetag",), counted=True)
 def include_file(include, system=False):
     return cgen.Include(include, system=system)
 
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index a97df4744fd1e661554a6febb306a8f858589c31..a4d8292f5f88bc315980efb521bf7c2cf6a95153 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -140,10 +140,9 @@ def _insn_cache_key(code=None, expression=None, **kwargs):
 def instruction(code=None, expression=None, **kwargs):
     assert (code is not None) or (expression is not None)
     assert not ((code is not None) and (expression is not None))
-    assert 'id' not in kwargs
 
     # Get an ID for this instruction
-    id = 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4))
+    id = kwargs.pop("id", 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4)))
 
     # Now create the actual instruction
     if code:
@@ -172,8 +171,8 @@ def noop_instruction(**kwargs):
                    context_tags="kernel",
                    cache_key_generator=no_caching,
                    )
-def transform(trafo, *args):
-    return (trafo, args)
+def transform(trafo, *args, **kwargs):
+    return (trafo, args, kwargs)
 
 
 @generator_factory(item_tags=("instruction", "barrier"),
@@ -216,3 +215,8 @@ def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwar
     globalarg(name, **kwargs)
 
     return name
+
+
+@generator_factory(item_tags=("substrule",), context_tags="kernel")
+def subst_rule(name, args, expr):
+    return lp.SubstitutionRule(name, args, expr)
diff --git a/python/dune/perftool/interactive.py b/python/dune/perftool/interactive.py
deleted file mode 100644
index 77094c7524fc2bc5d6a1dc1e0b4254458056ac35..0000000000000000000000000000000000000000
--- a/python/dune/perftool/interactive.py
+++ /dev/null
@@ -1,144 +0,0 @@
-from __future__ import print_function
-from functools import partial
-
-from dune.perftool.generation import global_context
-from dune.perftool.loopy.transformations import get_loopy_transformations
-from dune.perftool.pdelab.localoperator import LoopyKernelMethod
-from dune.perftool.pdelab.signatures import assembly_routine_signature
-
-import os
-
-
-# Use the builtin 'input' in python2 and 'raw_input' in python3
-try:
-    input = raw_input
-except:
-    pass
-
-
-def clear():
-    os.system('cls' if os.name == 'nt' else 'clear')
-
-
-def kernel_name(v):
-    first = None
-    if v[1] == "residual":
-        first = "alpha"
-    if v[1] == "jacobian":
-        first = "jacobian"
-    assert first
-
-    second = None
-    if v[0] == "cell":
-        second = "volume"
-    if v[0] == "exterior_facet":
-        second = "boundary"
-    if v[0] == "interior_facet":
-        second = "skeleton"
-    assert second
-
-    return "{}_{}".format(first, second)
-
-
-def show_kernel(which, kernel):
-    clear()
-    print("Showing the loo.py kernel for {}:\n".format(kernel_name(which)))
-    print(kernel.stringify(with_dependencies=True))
-    print("Press Return to return to the previous menu")
-    input()
-    return kernel
-
-
-def choose_transformation(which, kernel):
-    choice = None
-    while choice != "q":
-        clear()
-        keymap = {}
-        print("Choose one of the following transformations to apply to {}:\n".format(kernel_name(which)))
-
-        print("Transformations:")
-        for i, v in enumerate(get_loopy_transformations().values()):
-            print("  {}) {}".format(chr(ord('a') + i), v.name))
-            if v.description:
-                print("       {}".format(v.description))
-            keymap[chr(ord('a') + i)] = v
-
-        print("\n  q) Return to kernel options")
-        print("\nYour choice:")
-
-        choice = input().lower()
-        try:
-            kernel = keymap[choice](kernel)
-        except KeyError:
-            pass
-
-    return kernel
-
-
-def show_code(which, kernel):
-    clear()
-    print("Showing the generated dune-pdelab code for {}:\n".format(kernel_name(which)))
-
-    with global_context(integral_type=which[0], form_type=which[1]):
-        signature = assembly_routine_signature()
-        print("".join(LoopyKernelMethod(signature, kernel).generate()))
-
-    print("Press Return to return to the previous menu")
-    input()
-    return kernel
-
-
-def optimize_kernel(which, kernels):
-    kernel = kernels[which]
-    choice = None
-
-    while choice != "q":
-        clear()
-        print("Optimizing kernel {}:\n".format(kernel_name(which)))
-
-        print("Available options:")
-        print("  a) Show the loopy kernel")
-        print("  b) Apply loopy transformation")
-        print("  c) Show generated PDELab code for this kernel.")
-
-        print("\n  q) Return to the kernel overview")
-        print("\nYour choice:")
-
-        choice = input().lower()
-        try:
-            kernel = {'a': partial(show_kernel, which),
-                      'b': partial(choose_transformation, which),
-                      'c': partial(show_code, which)
-                      }[choice](kernel)
-        except KeyError:
-            pass
-
-    kernels[which] = kernel
-
-
-def kernel_choice(kernels):
-    choice = None
-    while choice != "q":
-        clear()
-        print("The following kernels are in the input. Pick one to optimize:")
-
-        keymap = {}
-        for i, k in enumerate(kernels.keys()):
-            print("  {}) {}".format(chr(ord('a') + i), kernel_name(k)))
-            keymap[chr(ord('a') + i)] = partial(optimize_kernel, k)
-
-        print("\n  q) End this interactive session and proceed to code generation")
-
-        print("\nYour choice: ")
-        choice = input().lower()
-        try:
-            keymap[choice](kernels)
-        except KeyError:
-            pass
-
-
-def start_interactive_session(kernels):
-    clear()
-    print("Welcome to the dune-perftool interactive mode!\n")
-
-    kernel_choice(kernels)
diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py
deleted file mode 100644
index c0e2ab310ce4cf6ddc8c8ea403d2e5cad1c63365..0000000000000000000000000000000000000000
--- a/python/dune/perftool/loopy/buffer.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from dune.perftool.error import PerftoolLoopyError
-from dune.perftool.generation import (get_counted_variable,
-                                      kernel_cached,
-                                      temporary_variable,
-                                      )
-
-
-class FlipFlopBuffer(object):
-    def __init__(self, identifier):
-        self.identifier = identifier
-
-        # Initialize the counter that switches between the base storages!
-        self._current = 0
-
-        # Generate the base storage names
-        self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in (0, 1))
-
-    def switch_base_storage(self):
-        self._current = (self._current + 1) % 2
-
-    def get_temporary(self, **kwargs):
-        assert("base_storage" not in kwargs)
-        assert("storage_shape" not in kwargs)
-
-        # Select the base storage and increase counter
-        base = self.base_storage[self._current]
-
-        # Construct a temporary name
-        name = kwargs.pop("name", None)
-        if name is None:
-            name = get_counted_variable(self.identifier)
-
-        # Construct the temporary and return it
-        temporary_variable(name,
-                           base_storage=base,
-                           managed=True,
-                           _base_storage_access_may_be_aliasing=True,
-                           **kwargs
-                           )
-
-        return name
-
-
-@kernel_cached
-def initialize_buffer(identifier):
-    assert isinstance(identifier, str)
-    return FlipFlopBuffer(identifier)
-
-
-def get_buffer_temporary(identifier, **kwargs):
-    return initialize_buffer(identifier).get_temporary(**kwargs)
-
-
-def switch_base_storage(identifier):
-    initialize_buffer(identifier).switch_base_storage()
diff --git a/python/dune/perftool/loopy/mangler.py b/python/dune/perftool/loopy/mangler.py
index 297968c6b8332e3af00c4d101656a2e970d63f70..29b0d503bcc0269c7a39d0c3a3accc1657781577 100644
--- a/python/dune/perftool/loopy/mangler.py
+++ b/python/dune/perftool/loopy/mangler.py
@@ -6,6 +6,7 @@ from dune.perftool.generation import (function_mangler,
                                       )
 
 from loopy import CallMangleInfo
+from loopy.types import to_loopy_type
 
 import numpy as np
 
@@ -48,3 +49,14 @@ def dune_math_manglers(kernel, name, arg_dtypes):
                               (dt,),
                               (dt,) * len(arg_dtypes),
                               )
+
+
+@function_mangler
+def get_time_function_mangler(kernel, name, arg_dtypes):
+    """ Map argument-free getTime calls to this->getTime, which local operators
+    provide once they inherit from InstationaryLocalOperatorDefaultMethods
+    """
+    if name == "getTime":
+        assert(len(arg_dtypes) == 0)
+        from dune.perftool.loopy.target import dtype_floatingpoint
+        return CallMangleInfo("this->getTime", (to_loopy_type(dtype_floatingpoint()),), ())
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index 408a7a3df3a2b4058420db27a1f750ebe98fa7c1..8a89579e70f3c5bad2fc3bd3548836836a0b6d45 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -9,6 +9,7 @@ from dune.perftool.generation import (include_file,
                                       retrieve_cache_functions,
                                       )
 from dune.perftool.options import get_option
+from dune.perftool.tools import round_to_multiple
 
 from loopy.symbolic import Literal
 from loopy.target import (TargetBase,
@@ -146,7 +147,7 @@ class DuneCExpressionToCodeMapper(CExpressionToCodeMapper):
 
 class DuneASTBuilder(CASTBuilder):
     def function_manglers(self):
-        return CASTBuilder.function_manglers(self) + retrieve_cache_functions("mangler")
+        return retrieve_cache_functions("mangler") + CASTBuilder.function_manglers(self)
 
     def get_expression_to_c_expression_mapper(self, codegen_state):
         return DuneExpressionToCExpressionMapper(codegen_state)
@@ -154,14 +155,16 @@ class DuneASTBuilder(CASTBuilder):
     def get_c_expression_to_code_mapper(self):
         return DuneCExpressionToCodeMapper()
 
-    def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info):
+    def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
         # If this is not a DuneTemporaryVariable, it was introduced by loopy
         # and it should be totally under loopys control: Call the base class implementation!
         if not (isinstance(temp_var, DuneTemporaryVariable) and temp_var.custom_declaration):
-            return CASTBuilder.get_temporary_decl(self, knl, schedule_index, temp_var, decl_info)
+            return CASTBuilder.get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info)
 
-        if temp_var.decl_method:
-            return cgen.Line(temp_var.decl_method(temp_var.name, temp_var.shape, temp_var.shape_impl))
+        if temp_var.custom_declaration:
+            decl = temp_var.decl_method(temp_var.name, codegen_state.kernel, decl_info)
+            if decl:
+                return cgen.Line(decl)
 
     def add_vector_access(self, access_expr, index):
         # There is no generic way of implementing a vector access with VCL, as
@@ -176,10 +179,34 @@ class DuneASTBuilder(CASTBuilder):
         return cgen.Line("BARRIER;")
 
     def get_temporary_decls(self, codegen_state, schedule_index):
+        temps = codegen_state.kernel.temporary_variables.values()
+        # Declare all the custom base storages (in sorted order, to keep the generated code reproducible)
+        ret = []
+        for bs in sorted(set(t.custom_base_storage for t in temps if isinstance(t, DuneTemporaryVariable)) - set({None})):
+            if bs in [a.name for a in codegen_state.kernel.args]:
+                continue
+
+            # Find the alignment bytes and the entry count of the largest temporary living in this storage
+            alignment = []
+            size = []
+            for t in temps:
+                if isinstance(t, DuneTemporaryVariable) and t.custom_base_storage == bs:
+                    # TODO Extract alignment from the temporaries after switching to loopy 2018.1
+                    alignment.append(get_option("max_vector_width") // 8)
+                    from pytools import product
+                    size.append(product(t.shape))
+
+            alignment = max(alignment)
+            size = max(size)
+            size = round_to_multiple(size, alignment)
+            # NOTE(review): the factor 8 below presumably is the byte size of one entry - confirm
+            decl = "char {}[{}] __attribute__ ((aligned({})));".format(bs, size * 8, alignment)
+            ret.append(cgen.Line(decl))
+
         if self.target.declare_temporaries:
-            return CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index)
+            return ret + CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index)
         else:
-            return []
+            return ret
 
 
 class BlockstructuredDuneExpressionToCExpressionMapper(DuneExpressionToCExpressionMapper):
diff --git a/python/dune/perftool/loopy/temporary.py b/python/dune/perftool/loopy/temporary.py
index d916f6b0312ac763d60829047a52056d088014bc..2bf78ce94c573ca0f1614fc89b72fee52fa16333 100644
--- a/python/dune/perftool/loopy/temporary.py
+++ b/python/dune/perftool/loopy/temporary.py
@@ -5,6 +5,7 @@ from dune.perftool.error import PerftoolLoopyError
 
 from loopy import TemporaryVariable
 
+import loopy as lp
 import numpy
 
 
@@ -27,7 +28,10 @@ def _temporary_type(shape_impl, shape, first=True):
         return "Dune::FieldMatrix<{}, {}, {}>".format(_type, shape[0], shape[1])
 
 
-def default_declaration(name, shape=(), shape_impl=()):
+def default_declaration(name, kernel, decl_info):
+    shape = kernel.temporary_variables[name].shape
+    shape_impl = kernel.temporary_variables[name].shape_impl
+
     # Determine the C++ type to use for this temporary.
     t = _temporary_type(shape_impl, shape)
     if len(shape_impl) == 0:
@@ -44,11 +48,20 @@ def default_declaration(name, shape=(), shape_impl=()):
         return '{} {}(0.0);'.format(t, name)
 
 
+def custom_base_storage_temporary_declaration(storage):
+    """ Build a decl_method that declares a temporary as a typed pointer into the buffer named storage """
+    def _decl(name, kernel, decl_info):
+        _type = kernel.target.dtype_to_typename(decl_info.dtype)
+        return "{0} *{1} = ({0} *){2};".format(_type, name, storage)
+
+    return _decl
+
+
 class DuneTemporaryVariable(TemporaryVariable):
 
-    allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method"]
+    allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method", "custom_base_storage"]
 
-    def __init__(self, name, managed=False, shape_impl=None, decl_method=None, **kwargs):
+    def __init__(self, name, managed=False, shape_impl=None, decl_method=None, custom_base_storage=None, **kwargs):
         self.managed = managed
         self.decl_method = decl_method
         self.shape_impl = shape_impl
@@ -59,6 +72,15 @@ class DuneTemporaryVariable(TemporaryVariable):
         from dune.perftool.loopy.target import dtype_floatingpoint
         kwargs.setdefault('dtype', dtype_floatingpoint())
 
+        if custom_base_storage and self.decl_method is None:
+            assert shape_impl is None
+            self.decl_method = custom_base_storage_temporary_declaration(custom_base_storage)
+
         self.custom_declaration = self.decl_method is not None
 
-        TemporaryVariable.__init__(self, name, managed=self.managed, shape_impl=self.shape_impl, decl_method=self.decl_method, **kwargs)
+        TemporaryVariable.__init__(self, name,
+                                   managed=self.managed,
+                                   shape_impl=self.shape_impl,
+                                   decl_method=self.decl_method,
+                                   custom_base_storage=custom_base_storage,
+                                   **kwargs)
diff --git a/python/dune/perftool/loopy/transformations/__init__.py b/python/dune/perftool/loopy/transformations/__init__.py
index db43b04544a9d0852d9181843e2a0fd554ac2bb6..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/python/dune/perftool/loopy/transformations/__init__.py
+++ b/python/dune/perftool/loopy/transformations/__init__.py
@@ -1,35 +0,0 @@
-""" Infrastructure for loopy transformations.
-These are registered to list them in interactive mode
-"""
-
-_loopy_trafo_registry = {}
-
-
-def get_loopy_transformations():
-    return _loopy_trafo_registry
-
-
-class LoopyTransformationWrapper(object):
-    def __init__(self, f, name=None, description=""):
-        self.func = f
-        self.name = name
-        self.description = description
-
-        assert name
-        assert name not in _loopy_trafo_registry
-
-        _loopy_trafo_registry[name] = self
-
-    def __call__(self, kernel):
-        return self.func(kernel)
-
-
-def loopy_transformation(_positional_arg=None, **kwargs):
-    assert not _positional_arg
-    return lambda f: LoopyTransformationWrapper(f, **kwargs)
-
-
-# Just for debugging purposes we add an identity transformation here.
-@loopy_transformation(name="identity", description='''Does not change the kernel. Proof of concept implementation''')
-def _identity(kernel):
-    return kernel
diff --git a/python/dune/perftool/loopy/transformations/disjointgroups.py b/python/dune/perftool/loopy/transformations/disjointgroups.py
deleted file mode 100644
index 78ea4c9d6574f294416c5bb0f6c440083ddd3603..0000000000000000000000000000000000000000
--- a/python/dune/perftool/loopy/transformations/disjointgroups.py
+++ /dev/null
@@ -1,13 +0,0 @@
-""" A helper transformation that makes all groups conflicting """
-
-from dune.perftool.options import get_option
-
-
-def make_groups_conflicting(knl):
-    # As this transformation introduces a performance bug that basically
-    # kills our CI, we only apply it if really needed - meaning in production.
-    if get_option("assure_statement_ordering"):
-        groups = frozenset().union(*tuple(i.groups for i in knl.instructions))
-        return knl.copy(instructions=[i.copy(conflicts_with_groups=groups - i.groups) for i in knl.instructions])
-    else:
-        return knl
diff --git a/python/dune/perftool/loopy/transformations/instrumentation.py b/python/dune/perftool/loopy/transformations/instrumentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..89b08b6f0e191ca06db56d820c585ebe585e250b
--- /dev/null
+++ b/python/dune/perftool/loopy/transformations/instrumentation.py
@@ -0,0 +1,101 @@
+""" Add instrumentation instructions to a kernel """
+
+from dune.perftool.generation import (dump_accumulate_timer,
+                                      post_include,
+                                      )
+from dune.perftool.options import get_option
+
+import loopy as lp
+
+
+def _intersect(a):
+    """ Intersect the frozensets of the tuple a (the empty tuple yields an empty frozenset) """
+    if a:
+        return frozenset.intersection(*a)
+    return frozenset()
+
+
+def _union(a):
+    """ Unite the frozensets of the tuple a (the empty tuple yields an empty frozenset) """
+    if a:
+        return frozenset.union(*a)
+    return frozenset()
+
+
+def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False):
+    """ Transform loopy kernel to contain instrumentation code
+
+    Arguments:
+    knl : The loopy kernel, follows the loopy transformation convention
+    match : A loopy match object or a string (interpreted as instruction ID or tag) to describe
+            which instructions should be wrapped in an instrumentation block.
+    identifier : The name of the counter to start and stop
+    level : The instrumentation level this measurement is defined at
+    filetag : The tag of the file that should contain the counter definitions
+    operator : Not used by this transformation at the moment
+    """
+    # If the instrumentation level is not high enough, this is a no-op
+    if level > get_option("instrumentation_level"):
+        return knl
+
+    # If a string was given for match, heuristically make it a match object
+    if isinstance(match, str):
+        match = lp.match.Or((lp.match.Id(match), lp.match.Tagged(match)))
+
+    # Find the instructions to wrap in instrumentation
+    insns = lp.find_instructions(knl, match)
+
+    # If the match is empty, this is also no op
+    if not insns:
+        return knl
+    insn_ids = frozenset(i.id for i in insns)
+
+    # Determine the iname nesting of the timing block
+    insn_inames = _intersect(tuple(i.within_inames for i in insns))
+    other_inames = _union(tuple(i.within_inames for i in lp.find_instructions(knl, lp.match.Not(match))))
+    within = _intersect((insn_inames, other_inames))
+
+    # Get a unique identifier - note that the same timer could be started and stopped several times
+    # within one kernel, so underscores are appended until the start ID is not taken yet
+    ident = identifier
+    while lp.find_instructions(knl, lp.match.Id("{}_start".format(ident))):
+        ident = "{}_".format(ident)
+
+    # Define the start instruction and correct dependencies for it
+    start_id = "{}_start".format(ident)
+    start_depends = _union(tuple(i.depends_on for i in insns)).difference(insn_ids)
+    start_insn = lp.CInstruction([],
+                                 "HP_TIMER_START({});".format(identifier),
+                                 id=start_id,
+                                 within_inames=within,
+                                 depends_on=start_depends,
+                                 boostable_into=frozenset(),
+                                 )
+
+    # Add dependencies on the timing instructions
+    rewritten_insns = [i.copy(depends_on=i.depends_on.union(frozenset({start_id}))) for i in insns]
+
+    # Define the stop instruction and correct dependencies for it
+    stop_id = "{}_stop".format(ident)
+    stop_insn = lp.CInstruction([],
+                                "HP_TIMER_STOP({});".format(identifier),
+                                id=stop_id,
+                                within_inames=within,
+                                depends_on=insn_ids,
+                                boostable_into=frozenset(),
+                                )
+
+    # Find all the instructions that should depend on stop
+    dep_insns = [i for i in lp.find_instructions(knl, lp.match.Not(match))
+                 if i.depends_on & insn_ids]
+    rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({stop_id}))) for i in dep_insns])
+
+    # Trigger code generation on the file/operator level
+    post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag)
+    dump_accumulate_timer(identifier)
+
+    # Keep all the untouched instructions and combine them with the modified and the new ones
+    rewritten_ids = frozenset(i.id for i in rewritten_insns)
+    other_insns = [i for i in knl.instructions if i.id not in rewritten_ids]
+
+    return knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn])
diff --git a/python/dune/perftool/loopy/transformations/vectorize_quad.py b/python/dune/perftool/loopy/transformations/vectorize_quad.py
index fa5b03c204b4d77f628a7566897d33dae3e67a7c..f82f482929a5eaa8f742bbe388c8eeff5bce4ceb 100644
--- a/python/dune/perftool/loopy/transformations/vectorize_quad.py
+++ b/python/dune/perftool/loopy/transformations/vectorize_quad.py
@@ -7,8 +7,7 @@ from dune.perftool.generation import (function_mangler,
                                       )
 from dune.perftool.loopy.target import dtype_floatingpoint
 from dune.perftool.loopy.vcl import get_vcl_type, get_vcl_type_size
-from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_vector_view,
-                                                            add_vector_view,
+from dune.perftool.loopy.transformations.vectorview import (add_vector_view,
                                                             get_vector_view_name,
                                                             )
 from dune.perftool.loopy.symbolic import substitute
@@ -149,7 +148,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
             knl = knl.copy(temporary_variables=tmps)
 
             # Introduce a vector view of the precomputation result
-            knl = add_vector_view(knl, prec_quantity, flatview=True)
+            knl = add_vector_view(knl, prec_quantity)
 
     #
     # Construct a flat loop for the given instructions
@@ -196,7 +195,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
                 horizontal, vertical = tuple(int(i) for i in re.match("vecsumfac_h(.*)_v(.*)", tag).groups())
 
                 # 1. Rotating the input data
-                knl = add_vector_view(knl, quantity, flatview=True)
+                knl = add_vector_view(knl, quantity)
                 if horizontal > 1:
                     new_insns.append(lp.CallInstruction((),  # assignees
                                                         prim.Call(TransposeReg(vertical=vertical, horizontal=horizontal),
@@ -207,6 +206,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
                                                         within_inames=common_inames.union(frozenset({outer_iname, vec_iname})),
                                                         within_inames_is_final=True,
                                                         id="{}_rotate{}".format(quantity, suffix),
+                                                        tags=frozenset({"sumfact_stage2"}),
                                                         ))
 
                 # Add substitution rules
@@ -219,7 +219,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
             elif tag is not None and tag == 'sumfac':
                 # Add a vector view to this quantity
                 expr, = quantity_exprs
-                knl = add_vector_view(knl, quantity, flatview=True)
+                knl = add_vector_view(knl, quantity)
                 replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)),
                                                   (vector_indices.get(1), prim.Variable(vec_iname)),
                                                   )
@@ -243,7 +243,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
     for insn in insns:
         # Get a vector view of the lhs expression
         lhsname = get_pymbolic_basename(insn.assignee)
-        knl = add_vector_view(knl, lhsname, pad_to=vec_size, flatview=True)
+        knl = add_vector_view(knl, lhsname)
         lhsname = get_vector_view_name(lhsname)
         rotating = "gradvec" in insn.tags
 
@@ -268,7 +268,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
                                        within_inames=common_inames.union(frozenset({outer_iname, vec_iname})),
                                        within_inames_is_final=True,
                                        id=insn.id,
-                                       tags=frozenset({"vec_write{}".format(suffix)})
+                                       tags=frozenset({"vec_write{}".format(suffix), "sumfact_stage2"})
                                        )
                          )
 
@@ -283,6 +283,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
                                                 within_inames=common_inames.union(frozenset({outer_iname, vec_iname})),
                                                 within_inames_is_final=True,
                                                 id="{}_rotateback{}".format(lhsname, suffix),
+                                                tags=frozenset({"sumfact_stage2"}),
                                                 ))
 
     # Add the necessary vector indices
@@ -297,6 +298,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
                                        within_inames=common_inames,
                                        within_inames_is_final=True,
                                        id="assign_{}{}".format(name, suffix),
+                                       tags=frozenset({"sumfact_stage2"}),
                                        ))
         new_insns.append(lp.Assignment(prim.Variable(name),  # assignee
                                        prim.Sum((prim.Variable(name), increment)),  # expression
@@ -305,6 +307,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
                                        depends_on=frozenset({Tagged("vec_write{}".format(suffix)), "assign_{}{}".format(name, suffix)}),
                                        depends_on_is_final=True,
                                        id="update_{}{}".format(name, suffix),
+                                       tags=frozenset({"sumfact_stage2"}),
                                        ))
 
     from loopy.kernel.creation import resolve_dependencies
diff --git a/python/dune/perftool/loopy/transformations/vectorview.py b/python/dune/perftool/loopy/transformations/vectorview.py
index 1160c8bda81f0f81f6a6c9c6b8c2db327b0fe79d..3a812abf50289b58e14bd2f155d155cc7bc3155d 100644
--- a/python/dune/perftool/loopy/transformations/vectorview.py
+++ b/python/dune/perftool/loopy/transformations/vectorview.py
@@ -5,7 +5,9 @@ being a an array of SIMD vectors
 """
 
 from dune.perftool.loopy.target import dtype_floatingpoint
+from dune.perftool.loopy.temporary import DuneTemporaryVariable
 from dune.perftool.loopy.vcl import get_vcl_type_size
+from dune.perftool.tools import round_to_multiple
 
 import loopy as lp
 import numpy as np
@@ -17,83 +19,47 @@ def get_vector_view_name(tmpname):
     return tmpname + "_vec"
 
 
-def add_vector_view(knl, tmpname, pad_to=None, flatview=False):
-    """
-    Kernel transformation to add a vector view temporary
-    that interprets the same memory as another temporary
-    """
+def add_vector_view(knl, tmpname, pad_to=1):
     temporaries = knl.temporary_variables
-    assert tmpname in temporaries
     temp = temporaries[tmpname]
-    vecname = get_vector_view_name(tmpname)
+    vectemp = get_vector_view_name(tmpname)
     bsname = tmpname + "_base"
+    vecsize = get_vcl_type_size(temp.dtype)
 
-    if vecname in knl.temporary_variables:
+    # Enforce idempotency
+    if vectemp in temporaries:
         return knl
 
-    # Add base storage to the original temporary!
-    if not temp.base_storage:
-        temp = temp.copy(base_storage=bsname)
-        temporaries[tmpname] = temp
-    else:
-        bsname = temp.base_storage
-
-    # Determine the shape by dividing total size by vector size
-    # Also apply the padding we need for rotation
-    # TODO: *Only* apply this padding if really needed (a bit hard to figure out)
-    vecsize = get_vcl_type_size(temp.dtype)
-    if all(isinstance(s, int) for s in temp.shape):
-        size = pt.product(temp.shape) // vecsize
-        if size % vecsize != 0:
-            size = (size // vecsize + 1) * vecsize
+    # Modify the original temporary to use our custom base storage mechanism
+    if isinstance(temp, DuneTemporaryVariable):
+        if temp.custom_base_storage:
+            bsname = temp.custom_base_storage
+        else:
+            temp = temp.copy(custom_base_storage=bsname)
+            temporaries[tmpname] = temp
     else:
-        size = prim.FloorDiv(prim.Product(temp.shape), vecsize)
-        size = (size // vecsize + 1) * vecsize
-
-    # Maybe do some padding.
-    if pad_to:
-        size = (size // pad_to + 1) * pad_to
+        temp = DuneTemporaryVariable(custom_base_storage=bsname,
+                                     managed=True,
+                                     **temp.get_copy_kwargs()
+                                     )
+        temporaries[tmpname] = temp
 
-    # Some vectorview are intentionally flat! (e.g. the output buffers of
-    # sum factorization kernels
-    if flatview:
-        shape = (size, vecsize)
-        dim_tags = "c,vec"
-    else:
-        shape = temp.shape
-        # This works around a loopy weirdness (which might as well be a bug)
-        # TODO: investigate this!
-        if len(shape) == 1:
-            shape = (1, vecsize)
-            dim_tags = "c,vec"
-        else:
-            dim_tags = temp.dim_tags[:-1] + ("vec",)
+    size = round_to_multiple(pt.product(temp.shape), vecsize) // vecsize
+    size = round_to_multiple(size, pad_to)
 
     # Now add a vector view temporary
-    vecname = tmpname + "_vec"
-    temporaries[vecname] = lp.TemporaryVariable(vecname,
-                                                dim_tags=dim_tags,
-                                                shape=shape,
-                                                base_storage=bsname,
-                                                dtype=dtype_floatingpoint(),
-                                                scope=lp.temp_var_scope.PRIVATE,
-                                                )
-
-    # Avoid that any of these temporaries are eliminated
-    silenced = ['temp_to_write({})'.format(tmpname),
-                'temp_to_write({})'.format(vecname),
-                'read_no_write({})'.format(tmpname),
-                'read_no_write({})'.format(vecname),
+    temporaries[vectemp] = DuneTemporaryVariable(vectemp,
+                                                 dim_tags="c,vec",
+                                                 shape=(size, vecsize),
+                                                 custom_base_storage=bsname,
+                                                 scope=lp.temp_var_scope.PRIVATE,
+                                                 managed=True,
+                                                 )
+
+    # Avoid that these temporaries are eliminated
+    silenced = ['temp_to_write({})'.format(vectemp),
+                'read_no_write({})'.format(vectemp),
                 ]
 
     return knl.copy(temporary_variables=temporaries,
                     silenced_warnings=knl.silenced_warnings + silenced)
-
-
-def add_temporary_with_vector_view(knl, name, *args, **kwargs):
-    temps = knl.temporary_variables
-    assert name not in temps
-    temps[name] = lp.TemporaryVariable(name, *args, **kwargs)
-    knl = knl.copy(temporary_variables=temps)
-    knl = add_vector_view(knl, name)
-    return knl
diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py
index 191889c00c9e5a4790a4b4bb9e0a25a1e3f2c16d..345dec931596c07c8641a03f5c0c5035f86d06b6 100644
--- a/python/dune/perftool/loopy/vcl.py
+++ b/python/dune/perftool/loopy/vcl.py
@@ -2,7 +2,7 @@
 Our extensions to the loopy type system
 """
 from dune.perftool.options import get_option
-from dune.perftool.generation import function_mangler
+from dune.perftool.generation import function_mangler, include_file
 
 import loopy as lp
 import numpy as np
@@ -62,8 +62,10 @@ def get_vcl_typename(nptype, register_size=None, vector_width=None):
 
 
 class ExplicitVCLCast(lp.symbolic.FunctionIdentifier):
-    def __init__(self, nptype, vector_width):
+    def __init__(self, nptype, vector_width=None):
         self.nptype = nptype
+        if vector_width is None:
+            vector_width = get_vcl_type_size(nptype)
         self.vector_width = vector_width
 
     def __getinitargs__(self):
@@ -74,8 +76,17 @@ class ExplicitVCLCast(lp.symbolic.FunctionIdentifier):
         return get_vcl_typename(self.nptype, vector_width=self.vector_width)
 
 
+class VCLLowerUpperLoad(ExplicitVCLCast):
+    pass
+
+
 @function_mangler
 def vcl_cast_mangler(knl, func, arg_dtypes):
+    if isinstance(func, VCLLowerUpperLoad):
+        return lp.CallMangleInfo(func.name,
+                                 (lp.types.NumpyType(func.nptype),),
+                                 arg_dtypes)
+
     if isinstance(func, ExplicitVCLCast):
         return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],))
 
@@ -107,10 +118,11 @@ def vcl_function_mangler(knl, func, arg_dtypes):
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
         return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))
 
-    if func == "horizontal_add":
+    if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"):
         dtype = arg_dtypes[0]
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
-        return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(dtype.dtype),), (vcl,))
+        include_file("dune/perftool/sumfact/horizontaladd.hh", filetag="operatorfile")
+        return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,))
 
     if isinstance(func, VCLPermute):
         dtype = arg_dtypes[0]
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 9f7c0e19619317b040083d98796939148fe29fb0..1ecc85c5903b69f83953502e8afa485f5a962702 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -25,10 +25,10 @@ class PerftoolOption(ImmutableRecord):
                                  )
 
 
-class PerftoolOptionsArray(ImmutableRecord):
+class PerftoolGlobalOptionsArray(ImmutableRecord):
     """ A collection of form compiler arguments """
     def __init__(self, **kwargs):
-        opts = {k: v.default for k, v in PerftoolOptionsArray.__dict__.items() if isinstance(v, PerftoolOption)}
+        opts = {k: v.default for k, v in PerftoolGlobalOptionsArray.__dict__.items() if isinstance(v, PerftoolOption)}
         opts.update(**kwargs)
         ImmutableRecord.__init__(self, **opts)
 
@@ -36,23 +36,46 @@ class PerftoolOptionsArray(ImmutableRecord):
     uflfile = PerftoolOption(helpstr="the UFL file to compile")
     debug_cache_with_stack = PerftoolOption(default=False, helpstr="Store stack along with cache objects. Makes debugging caching issues easier.")
     driver_file = PerftoolOption(helpstr="The filename for the generated driver header")
-    operator_file = PerftoolOption(helpstr="The filename for the generated local operator header")
-    numerical_jacobian = PerftoolOption(default=False, helpstr="use numerical jacobians (only makes sense, if uflpdelab for some reason fails to generate analytic jacobians)")
-    matrix_free = PerftoolOption(default=False, helpstr="Use iterative solver with matrix free jacobian application")
     explicit_time_stepping = PerftoolOption(default=False, helpstr="use explicit time stepping")
     exact_solution_expression = PerftoolOption(helpstr="name of the exact solution expression in the ufl file")
     compare_l2errorsquared = PerftoolOption(helpstr="maximal allowed l2 error squared of difference between numerical solution and interpolation of exact solution (NOTE: requires --exact-solution-expression)")
-    interactive = PerftoolOption(default=False, helpstr="whether the optimization process should be guided interactively (also useful for debugging)")
+    l2error_tree_path = PerftoolOption(default=None, helpstr="Tree pathes that should be considered for l2 error calculation. Default None means we take all of them into account.")
+    ini_file = PerftoolOption(helpstr="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)")
+    opcounter = PerftoolOption(default=False, helpstr="Count operations. Note: In this case only operator applications are generated since solving and operator counting does not work. You probably want to set instrumentation level>0.")
+    performance_measuring = PerftoolOption(default=False, helpstr="Generate opcounter codepath, but only measure times!")
+    instrumentation_level = PerftoolOption(default=0, helpstr="Control time/opcounter measurements. 0-do nothing, 1-measure program as a whole, 2-operator applications, 3-measure kernel (eg. alpha-volume, ...), 4-parts of kernel (eg. stage 1-3 of SF)")
+    project_basedir = PerftoolOption(helpstr="The base (build) directory of the dune-perftool project")
+    architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl|skylake")
+    yaspgrid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
+    precision_bits = PerftoolOption(default=64, helpstr="The number of bits for the floating point type")
+    overlapping = PerftoolOption(default=False, helpstr="Use an overlapping solver and constraints. You still need to make sure to construct a grid with overlap! The parallel option will be set automatically.")
+    operators = PerftoolOption(default="r", helpstr="A comma separated list of operators, each name will be interpreted as a subsection name within the formcompiler section")
+    target_name = PerftoolOption(default=None, helpstr="The target name from CMake")
+    operator_to_build = PerftoolOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
+
+    # Arguments that are mainly to be set by logic depending on other options
+    max_vector_width = PerftoolOption(default=256, helpstr=None)
+    parallel = PerftoolOption(default=False, helpstr="Mark that this program should be run in parallel. If set to true the c++ code will check that there are more than 1 MPI-ranks involved and the error computation will use communication.")
+
+
+class PerftoolFormOptionsArray(ImmutableRecord):
+    """ A collection of form-specific form compiler arguments """
+    def __init__(self, **kwargs):
+        opts = {k: v.default for k, v in PerftoolFormOptionsArray.__dict__.items() if isinstance(v, PerftoolOption)}
+        opts.update(**kwargs)
+        ImmutableRecord.__init__(self, **opts)
+
+    # Form specific options
+    form = PerftoolOption(default=None, helpstr="The name of the UFL object representing the form in the UFL file")
+    filename = PerftoolOption(default=None, helpstr="The filename to use for this LocalOperator")
+    classname = PerftoolOption(default=None, helpstr="The name of the C++ class to generate")
+    numerical_jacobian = PerftoolOption(default=False, helpstr="use numerical jacobians (only makes sense, if uflpdelab for some reason fails to generate analytic jacobians)")
+    matrix_free = PerftoolOption(default=False, helpstr="Generate jacobian_apply_* methods for matrix free solvers")
     print_transformations = PerftoolOption(default=False, helpstr="print out dot files after ufl tree transformations")
     print_transformations_dir = PerftoolOption(default=".", helpstr="place where to put dot files (can be omitted)")
     quadrature_order = PerftoolOption(_type=int, helpstr="Quadrature order used for all integrals.")
     diagonal_transformation_matrix = PerftoolOption(default=False, helpstr="set option if the jacobian of the transformation is diagonal (axiparallel grids)")
     constant_transformation_matrix = PerftoolOption(default=False, helpstr="set option if the jacobian of the transformation is constant on a cell")
-    ini_file = PerftoolOption(helpstr="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)")
-    opcounter = PerftoolOption(default=False, helpstr="Count operations. Note: In this case only oparor applications are generated since solving and operator counting does not work. You probably want to set instrumentation level>0.")
-    time_opcounter = PerftoolOption(default=False, helpstr="Generate opcounter codepath. Can be used for timing opcounter programs without setting the opcounter option.")
-    instrumentation_level = PerftoolOption(default=0, helpstr="Control time/opcounter measurements. 0-do nothing, 1-measure program as a whole, 2-operator applications, 3-measure kernel (eg. alpha-volume, ...), 4-parts of kernel (eg. stage 1-3 of SF)")
-    project_basedir = PerftoolOption(helpstr="The base (build) directory of the dune-perftool project")
     fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
     sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
     vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
@@ -61,36 +84,37 @@ class PerftoolOptionsArray(ImmutableRecord):
     vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
     vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
     vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
-    turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
-    architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
-    grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
-    simplify = PerftoolOption(default=True, helpstr="Whether to simplify expressions using sympy")
-    precision_bits = PerftoolOption(default=64, helpstr="The number of bits for the floating point type")
-    assure_statement_ordering = PerftoolOption(default=False, helpstr="Whether special care should be taken for a good statement ordering in sumfact kernels, runs into a loopy scheduler performance bug, but is necessary for production.")
-
-    # Arguments that are mainly to be set by logic depending on other options
-    max_vector_width = PerftoolOption(default=256, helpstr=None)
+    vectorization_list_index = PerftoolOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).")
+    simplify = PerftoolOption(default=False, helpstr="Whether to simplify expressions using sympy")
+    generate_jacobians = PerftoolOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.")
+    generate_residuals = PerftoolOption(default=True, helpstr="Whether alpha_* methods should be generated.")
     unroll_dimension_loops = PerftoolOption(default=False, helpstr="whether loops over the geometric dimension should be unrolled")
     precompute_quadrature_info = PerftoolOption(default=True, helpstr="compute quadrature points and weights in the constructor of the local operator")
     blockstructured = PerftoolOption(default=False, helpstr="Use block structure")
     number_of_blocks = PerftoolOption(default=1, helpstr="Number of sub blocks in one direction")
     vectorization_blockstructured = PerftoolOption(default=False, helpstr="Vectorize block structuring")
-
+    adjoint = PerftoolOption(default=False, helpstr="Generate adjoint operator")
+    control = PerftoolOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable")
+    objective_function = PerftoolOption(default=None, helpstr="Name of form representing the objective function in UFL file")
+    control_variable = PerftoolOption(default=None, helpstr="Name of control variable in UFL file")
+    block_preconditioner_diagonal = PerftoolOption(default=False, helpstr="Whether this operator should implement the diagonal part of a block preconditioner")
+    block_preconditioner_offdiagonal = PerftoolOption(default=False, helpstr="Whether this operator should implement the off-diagonal part of a block preconditioner")
 
 # Until more sophisticated logic is needed, we keep the actual option data in this module
-_options = PerftoolOptionsArray()
+_global_options = PerftoolGlobalOptionsArray()
+_form_options = {}
 
 
 def initialize_options():
     """ Initialize the options from the command line """
-    global _options
-    _options = update_options_from_commandline(_options)
-    _options = update_options_from_inifile(_options)
+    global _global_options
+    _global_options = update_options_from_commandline(_global_options)
+    _global_options = update_options_from_inifile(_global_options)
 
 
 def update_options_from_commandline(opt):
     """ Return an options array object with updated values from the commandline """
-    assert isinstance(opt, PerftoolOptionsArray)
+    assert isinstance(opt, PerftoolGlobalOptionsArray)
     parser = ArgumentParser(description="Compile UFL files to PDELab C++ code",
                             epilog="Please report bugs to dominic.kempf@iwr.uni-heidelberg.de",
                             )
@@ -106,26 +130,58 @@ def update_options_from_commandline(opt):
 def update_options_from_inifile(opt):
     """ Return an options array object with updated values from an inifile """
     if opt.ini_file:
-        def _fix_types(k, v):
-            if hasattr(type(opt), k) and getattr(type(opt), k).type is bool:
-                return bool(eval(v))
-            if hasattr(type(opt), k):
-                return getattr(type(opt), k).type(v)
-            return v
-        ini = parse_ini_file(opt.ini_file).get("formcompiler", {})
-        ini = {k: _fix_types(k, v) for k, v in ini.items()}
-        opt = opt.copy(**ini)
+        def parse_ini(section, opttype):
+            def _fix_types(k, v):
+                if hasattr(opttype, k) and getattr(opttype, k).type is bool:
+                    return bool(eval(v))
+                if hasattr(opttype, k):
+                    return getattr(opttype, k).type(v)
+                return v
+            ini = parse_ini_file(opt.ini_file).get(section, {})
+            return {k: _fix_types(k, v) for k, v in ini.items()}
+
+        opt = opt.copy(**parse_ini("formcompiler", PerftoolGlobalOptionsArray))
+        # Also parse form-specific options
+        for form in [i.strip() for i in opt.operators.split(",")]:
+            _form_options[form] = PerftoolFormOptionsArray(**parse_ini("formcompiler.{}".format(form), PerftoolFormOptionsArray))
+
     return opt
 
 
 @memoize
-def process_options(opt):
+def process_global_options(opt):
     """ Make sure that the options have been fully processed """
     opt = expand_architecture_options(opt)
 
+    if opt.overlapping:
+        opt = opt.copy(parallel=True)
+
+    return opt
+
+
+@memoize
+def process_form_options(opt, form):
     if opt.sumfact:
         opt = opt.copy(unroll_dimension_loops=True)
 
+    if opt.numerical_jacobian:
+        opt = opt.copy(generate_jacobians=False)
+
+    if opt.form is None:
+        opt = opt.copy(form=form)
+
+    if opt.classname is None:
+        opt = opt.copy(classname="{}Operator".format(form))
+
+    if opt.filename is None:
+        opt = opt.copy(filename="{}_{}_file.hh".format(get_option("target_name"), opt.classname))
+
+    if opt.block_preconditioner_diagonal or opt.block_preconditioner_offdiagonal:
+        assert opt.numerical_jacobian is False
+        opt = opt.copy(generate_residuals=False,
+                       generate_jacobians=False,
+                       matrix_free=True,
+                       )
     return opt
 
 
@@ -134,6 +190,8 @@ def expand_architecture_options(opt):
         return opt.copy(max_vector_width=256)
     elif opt.architecture == "knl":
         return opt.copy(max_vector_width=512)
+    elif opt.architecture == "skylake":
+        return opt.copy(max_vector_width=512)
     else:
         raise NotImplementedError("Architecture {} not known!".format(opt.architecture))
 
@@ -145,18 +203,44 @@ def set_option(key, value):
     overwritten.  Form compiler arguments will always be set before
     any other options.
     """
-    global _options
-    _options = process_options(_options).copy(**{key: value})
+    global _global_options
+    _global_options = process_global_options(_global_options).copy(**{key: value})
+
+
+def set_form_option(key, value, form=None):
+    if form is None:
+        from dune.perftool.generation import get_global_context_value
+        form = get_global_context_value("form_identifier", 0)
+    if isinstance(form, int):
+        form = get_option("operators").split(",")[form].strip()
+    _form_options[form] = _form_options[form].copy(**{key: value})
 
 
 def get_option(key):
-    return getattr(process_options(_options), key)
+    processed_global_opts = process_global_options(_global_options)
+    return getattr(processed_global_opts, key)
+
+
+def get_form_option(key, form=None):
+    if form is None:
+        from dune.perftool.generation import get_global_context_value
+        form = get_global_context_value("form_identifier", 0)
+    if isinstance(form, int):
+        form = get_option("operators").split(",")[form].strip()
+    processed_form_opts = process_form_options(_form_options[form], form)
+    return getattr(processed_form_opts, key)
 
 
 def option_switch(opt):
     def _switch():
-        if get_option(opt):
-            return opt
-        else:
-            return "default"
+        try:
+            if get_option(opt):
+                return opt
+            else:
+                return "default"
+        except AttributeError:
+            if get_form_option(opt):
+                return opt
+            else:
+                return "default"
     return _switch
diff --git a/python/dune/perftool/pdelab/__init__.py b/python/dune/perftool/pdelab/__init__.py
index 2450b8a6912550eefd6c98cd70a06228b6ad216e..d3153ca0b8612b5391843a8689fa2c1c9d16bc56 100644
--- a/python/dune/perftool/pdelab/__init__.py
+++ b/python/dune/perftool/pdelab/__init__.py
@@ -13,6 +13,7 @@ from dune.perftool.pdelab.argument import (pymbolic_apply_function,
 from dune.perftool.pdelab.basis import (pymbolic_basis,
                                         pymbolic_reference_gradient,
                                         )
+from dune.perftool.pdelab.function import pymbolic_gridfunction
 from dune.perftool.pdelab.geometry import (component_iname,
                                            pymbolic_cell_volume,
                                            pymbolic_facet_area,
@@ -25,9 +26,6 @@ from dune.perftool.pdelab.geometry import (component_iname,
                                            )
 from dune.perftool.pdelab.index import (name_index,
                                         )
-from dune.perftool.pdelab.parameter import (cell_parameter_function,
-                                            intersection_parameter_function,
-                                            )
 from dune.perftool.pdelab.quadrature import (pymbolic_quadrature_weight,
                                              pymbolic_quadrature_position,
                                              quadrature_inames,
@@ -101,15 +99,8 @@ class PDELabInterface(object):
     def pymbolic_apply_function(self, element, restriction, index):
         return pymbolic_apply_function(self.visitor, element, restriction, index)
 
-    #
-    # Parameter function related generator functions
-    #
-
-    def intersection_parameter_function(self, name, expr, cellwise_constant):
-        return intersection_parameter_function(name, expr, cellwise_constant)
-
-    def cell_parameter_function(self, name, expr, restriction, cellwise_constant):
-        return cell_parameter_function(name, expr, restriction, cellwise_constant)
+    def pymbolic_gridfunction(self, coeff, restriction, grad):
+        return pymbolic_gridfunction(coeff, restriction, grad)
 
     #
     # Tensor expression related generator functions
diff --git a/python/dune/perftool/pdelab/adjoint.py b/python/dune/perftool/pdelab/adjoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..aef03bbbf08e45f2510bd90f164bdd70b071580c
--- /dev/null
+++ b/python/dune/perftool/pdelab/adjoint.py
@@ -0,0 +1,171 @@
+import logging
+
+import numpy
+
+from loopy import CallMangleInfo
+from loopy.symbolic import FunctionIdentifier
+from loopy.types import NumpyType
+
+import pymbolic.primitives as prim
+
+from dune.perftool.generation import (class_member,
+                                      constructor_parameter,
+                                      function_mangler,
+                                      get_global_context_value,
+                                      global_context,
+                                      globalarg,
+                                      initializer_list,
+                                      template_parameter,
+                                      )
+from dune.perftool.options import (get_form_option,
+                                   )
+from dune.perftool.loopy.target import dtype_floatingpoint
+from dune.perftool.pdelab import PDELabInterface
+from dune.perftool.pdelab.localoperator import (boundary_predicates,
+                                                determine_accumulation_space,
+                                                extract_kernel_from_cache,
+                                                )
+
+
+@template_parameter(classtag="operator")
+def type_dJdm():
+    return "DJDM_VEC"
+
+
+def name_dJdm_constructor_argument(name):
+    _type = type_dJdm()
+    constructor_name = name + "_"
+    constructor_parameter("{}&".format(_type), constructor_name, classtag="operator")
+    return constructor_name
+
+
+@class_member(classtag="operator")
+def define_dJdm_member(name):
+    _type = type_dJdm()
+    param = name_dJdm_constructor_argument(name)
+    initializer_list(name, [param, ], classtag="operator")
+    return "{}& {};".format(_type, name)
+
+
+def generate_accumulation_instruction(expr, visitor, accumulation_index, number_of_controls):
+    # Create class member dJdm for accumulating
+    accumvar = "dJdm"
+    shape = (number_of_controls,)
+    define_dJdm_member(accumvar)
+
+    # Tell loopy about the accumulation variable
+    globalarg(accumvar, shape=shape)
+    assignee = prim.Subscript(prim.Variable(accumvar), accumulation_index)
+
+    # We need to accumulate
+    expr = prim.Sum((assignee, expr))
+
+    from dune.perftool.generation import instruction
+    quad_inames = visitor.interface.quadrature_inames()
+    instruction(assignee=assignee,
+                expression=expr,
+                forced_iname_deps=frozenset(quad_inames),
+                forced_iname_deps_is_final=True,
+                )
+
+
+def list_accumulation_infos(expr, visitor):
+    return ["control", ]
+
+
+class ControlInterface(PDELabInterface):
+    """Interface for generating the control localoperator
+
+    In this case we will not accumulate in the residual vector but use
+    a class member representing dJdm instead.
+
+    """
+    def __init__(self, accumulation_index, number_of_controls):
+        """Create ControlInterface
+
+        Arguments:
+        ----------
+        accumulation_index: Component of dJdm into which contributions are accumulated.
+        number_of_controls: Number of components of dJdm. Needed for creating the member variable.
+        """
+        self.accumulation_index = accumulation_index
+        self.number_of_controls = number_of_controls
+
+    def list_accumulation_infos(self, expr, visitor):
+        return list_accumulation_infos(expr, visitor)
+
+    def generate_accumulation_instruction(self, expr, visitor):
+        return generate_accumulation_instruction(expr,
+                                                 visitor,
+                                                 self.accumulation_index,
+                                                 self.number_of_controls)
+
+
+def get_visitor(measure, subdomain_id, accumulation_index, number_of_controls):
+    interface = ControlInterface(accumulation_index, number_of_controls)
+    from dune.perftool.ufl.visitor import UFL2LoopyVisitor
+    return UFL2LoopyVisitor(interface, measure, subdomain_id)
+
+
+def visit_integral(integral, accumulation_index, number_of_controls):
+    integrand = integral.integrand()
+    measure = integral.integral_type()
+    subdomain_id = integral.subdomain_id()
+
+    # The visitor needs to know about the current index and the number
+    # of controls in order to generate the accumulation instruction
+    visitor = get_visitor(measure, subdomain_id, accumulation_index, number_of_controls)
+
+    # Start the visiting process!
+    visitor.accumulate(integrand)
+
+
+def generate_kernel(forms):
+    # Similar to the standard residual generation, except:
+    # - Have multiple forms
+    # - Pass index and number of forms along
+    logger = logging.getLogger(__name__)
+
+    # Visit all integrals once to collect information (dry-run)!
+    logger.debug('generate_kernel: visit_integrals (dry run)')
+    with global_context(dry_run=True):
+        for i, form in enumerate(forms):
+            for integral in form:
+                visit_integral(integral, i, len(forms))
+
+    # Now perform some checks on what should be done
+    from dune.perftool.sumfact.vectorization import decide_vectorization_strategy
+    logger.debug('generate_kernel: decide_vectorization_strategy')
+    decide_vectorization_strategy()
+
+    # Delete the cache contents and do the real thing!
+    logger.debug('generate_kernel: visit_integrals (no dry run)')
+    from dune.perftool.generation import delete_cache_items
+    delete_cache_items("kernel_default")
+    for i, form in enumerate(forms):
+        for integral in form:
+            visit_integral(integral, i, len(forms))
+
+    from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature
+    name = kernel_name()
+    signature = assembly_routine_signature()
+    knl = extract_kernel_from_cache("kernel_default", name, signature)
+    delete_cache_items("kernel_default")
+
+    # Reset the quadrature degree
+    from dune.perftool.sumfact.tabulation import set_quadrature_points
+    set_quadrature_points(None)
+
+    # Clean the cache from any data collected after the dry run
+    delete_cache_items("dryrundata")
+
+    return knl
+
+
+# @backend(interface="generate_kernels_per_integral")
+def control_generate_kernels_per_integral(forms):
+    """For the control problem, `forms` contains one form for every
+    measure. Every form contains integrals of only one type.
+
+    """
+    yield generate_kernel(forms)
diff --git a/python/dune/perftool/pdelab/argument.py b/python/dune/perftool/pdelab/argument.py
index 848d9d9ad275b4ff910da9e9f6514fe883f9b436..30449edea105a0a962c5883559e61ced1fc4cdb0 100644
--- a/python/dune/perftool/pdelab/argument.py
+++ b/python/dune/perftool/pdelab/argument.py
@@ -5,7 +5,6 @@ Namely:
 * accumulation object (r, jac...)
 """
 
-from dune.perftool.options import get_option
 from dune.perftool.generation import (domain,
                                       function_mangler,
                                       iname,
@@ -178,14 +177,14 @@ def name_accumulation_variable(restrictions=None):
             if measure == "cell":
                 restrictions = (Restriction.NONE,)
             else:
-                restrictions = (Restriction.NEGATIVE,)
+                restrictions = (Restriction.POSITIVE,)
         return name_residual(*restrictions)
     if ft == 'jacobian':
         if restrictions is None:
             if measure == "cell":
                 restrictions = (Restriction.NONE, Restriction.NONE)
             else:
-                restrictions = (Restriction.NEGATIVE, Restriction.NEGATIVE)
+                restrictions = (Restriction.POSITIVE, Restriction.POSITIVE)
         return name_jacobian(*restrictions)
     assert False
 
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index 7effdb5835daa3c333bfbd134da45366310c9a07..55c9a7ef7676c7647d485486d73440aed9d22aa2 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -6,10 +6,11 @@ from dune.perftool.generation import (backend,
                                       include_file,
                                       instruction,
                                       kernel_cached,
+                                      preamble,
                                       temporary_variable,
                                       )
 from dune.perftool.options import (option_switch,
-                                   get_option
+                                   get_form_option,
                                    )
 from dune.perftool.pdelab.spaces import (lfs_iname,
                                          lfs_inames,
@@ -23,6 +24,7 @@ from dune.perftool.pdelab.geometry import (component_iname,
                                            world_dimension,
                                            name_jacobian_inverse_transposed,
                                            to_cell_coordinates,
+                                           name_cell,
                                            )
 from dune.perftool.pdelab.localoperator import (lop_template_ansatz_gfs,
                                                 lop_template_test_gfs,
@@ -84,7 +86,7 @@ def declare_cache_temporary(element, restriction, which):
     t_cache = type_localbasis_cache(element)
     lfs = name_leaf_lfs(element, restriction)
 
-    def decl(name, shape, shape_impl):
+    def decl(name, kernel, decl_info):
         return "typename {}::{}ReturnType {};".format(t_cache,
                                                       which,
                                                       name,
@@ -171,7 +173,7 @@ def evaluate_coefficient(visitor, element, name, container, restriction, index):
     lfs = name_lfs(element, restriction, index)
     basis = visitor.interface.pymbolic_basis(sub_element, restriction, 0, context='trial')
     basisindex = get_pymbolic_indices(basis)[:-1]
-    if get_option("blockstructured"):
+    if get_form_option("blockstructured"):
         from dune.perftool.blockstructured.argument import pymbolic_coefficient
         coeff = pymbolic_coefficient(container, lfs, sub_element, basisindex)
     else:
@@ -207,7 +209,7 @@ def evaluate_coefficient_gradient(visitor, element, name, container, restriction
     from dune.perftool.tools import maybe_wrap_subscript
     basis = maybe_wrap_subscript(basis, Variable(dimindex))
 
-    if get_option("blockstructured"):
+    if get_form_option("blockstructured"):
         from dune.perftool.blockstructured.argument import pymbolic_coefficient
         coeff = pymbolic_coefficient(container, lfs, sub_element, basisindex)
     else:
@@ -221,5 +223,5 @@ def evaluate_coefficient_gradient(visitor, element, name, container, restriction
     instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True),
                 assignee=assignee,
                 forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset({dimindex})),
-                forced_iname_deps_is_final=True
+                forced_iname_deps_is_final=True,
                 )
diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py
index e052fa954f61a3d0890ba373a3527cc770ff3fac..3877165c574ac8ef7b6d7197b56f97d071f38f42 100644
--- a/python/dune/perftool/pdelab/driver/__init__.py
+++ b/python/dune/perftool/pdelab/driver/__init__.py
@@ -12,90 +12,60 @@ NB: Previously this __init__.py was a module driver.py. As it was growing,
 """
 from dune.perftool.error import PerftoolCodegenError
 from dune.perftool.generation import (generator_factory,
+                                      get_global_context_value,
                                       global_context,
                                       include_file,
                                       cached,
                                       pre_include,
                                       preamble,
                                       )
-from dune.perftool.options import get_option
+from dune.perftool.options import (get_form_option,
+                                   get_option,
+                                   )
 
-# Have a global variable with the entire form data. This allows functions that depend
-# deterministically on the entire data set to directly access it instead of passing it
-# through the entire generator chain.
-_driver_data = {}
+#
+# The following functions are not doing anything useful, but providing easy access
+# to quantities that are needed throughout the process of generating the driver!
+#
 
 
-# Have a function access this global data structure
-def set_driver_data(formdatas, data):
-    assert (len(formdatas) <= 2)
-    if len(formdatas) == 1:
-        _driver_data['form'] = formdatas[0].preprocessed_form
-        _driver_data['formdata'] = formdatas[0]
-    else:
-        mass_index = mass_form_index(formdatas, data)
-        if mass_index is None:
-            raise NotImplementedError("Form for mass matrix needs to have name 'mass' in ufl file.")
-        _driver_data['mass_form'] = formdatas[mass_index].preprocessed_form
-        _driver_data['mass_formdata'] = formdatas[mass_index]
-        _driver_data['form'] = formdatas[1 - mass_index].preprocessed_form
-        _driver_data['formdata'] = formdatas[1 - mass_index]
+def get_form_ident():
+    names = list(map(str.strip, get_option("operators").split(",")))
+    if len(names) == 2:
+        names.remove("mass")  # ValueError unless one of the two is "mass"
+    assert len(names) == 1
+    return names[0]
+
 
-    _driver_data['data'] = data
+def get_form():
+    objects = get_global_context_value("data").object_by_name
+    return objects[get_form_option("form", get_form_ident())]
 
 
 def get_dimension():
-    return _driver_data['form'].ufl_cell().geometric_dimension()
+    return get_form().ufl_cell().geometric_dimension()
 
 
 def get_cell():
-    return _driver_data['form'].ufl_cell().cellname()
+    return get_form().ufl_cell().cellname()
 
 
 def get_test_element():
-    return _driver_data['form'].arguments()[0].ufl_element()
+    return get_form().arguments()[0].ufl_element()
 
 
 def get_trial_element():
-    return _driver_data['form'].coefficients()[0].ufl_element()
-
-
-def get_formdata():
-    return _driver_data['formdata']
-
-
-def get_mass_formdata():
-    return _driver_data["mass_formdata"]
+    return get_form().coefficients()[0].ufl_element()
 
 
 def is_stationary():
-    return 'mass_form' not in _driver_data
-
-
-def form_name_suffix(name, formdata):
-    from dune.perftool.pdelab.localoperator import name_form
-    data = _driver_data['data']
-    form_name = name_form(formdata, data)
-    return name + '_' + form_name
-
-
-def get_object(name):
-    return _driver_data['data'].object_by_name.get(name, None)
-
-
-def mass_form_index(formdatas, data):
-    for index, formdata in enumerate(formdatas):
-        try:
-            if data.object_names[id(formdata.original_form)] == 'mass':
-                return index
-        except KeyError:
-            continue
+    return "mass" not in [i.strip() for i in get_option("operators").split(",")]
 
 
 def is_linear(form=None):
     '''Test if form is linear in trial function'''
     if form is None:
-        form = get_formdata().original_form
+        form = get_form()
     from ufl import derivative
     from ufl.algorithms import expand_derivatives
     jacform = expand_derivatives(derivative(form, form.coefficients()[0]))
@@ -192,8 +162,11 @@ def unroll_list_tensors(data):
             yield e
 
 
-def preprocess_leaf_data(element, data):
-    data = get_object(data)
+def preprocess_leaf_data(element, data, applyZeroDefault=True):
+    data = get_global_context_value("data").object_by_name.get(data, None)
+    if data is None and not applyZeroDefault:
+        return None
+
     from ufl import MixedElement
     if isinstance(element, MixedElement):
         # data is None -> use 0 default
@@ -222,7 +195,7 @@ def name_inifile():
     return "argv[1]"
 
 
-@preamble
+@preamble(section="init")
 def parse_initree(varname):
     include_file("dune/common/parametertree.hh", filetag="driver")
     include_file("dune/common/parametertreeparser.hh", filetag="driver")
@@ -236,7 +209,7 @@ def name_initree():
     return "initree"
 
 
-@preamble
+@preamble(section="init")
 def define_mpihelper(name):
     include_file("dune/common/parallel/mpihelper.hh", filetag="driver")
     return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
@@ -248,23 +221,40 @@ def name_mpihelper():
     return name
 
 
-def generate_driver(formdatas, data):
-    # The driver module uses a global dictionary for storing necessary data
-    set_driver_data(formdatas, data)
+@preamble(section="grid")
+def check_parallel_execution():
+    # Emits C++ that returns 1 from the driver when the communicator has size 1.
+    from dune.perftool.pdelab.driver.gridfunctionspace import name_leafview
+    guard = "if ({}.comm().size()==1){{".format(name_leafview())
+    message = '  std::cout << "This program should be run in parallel!"  << std::endl;'
+    lines = [guard, message, "  return 1;", "}"]
+    return lines
+
+
+def generate_driver():
+    # Guarantee that config.h is the very first include in the generated file
+    include_file("config.h", filetag="driver")
+
+    # Make sure that the MPI helper is instantiated
+    name_mpihelper()
+
+    # Add check to c++ file if this program should only be used in parallel mode
+    if get_option("parallel"):
+        check_parallel_execution()
 
     # Entrypoint for driver generation
-    if get_option("opcounter") or get_option("time_opcounter"):
-        if get_option("time_opcounter"):
+    if get_option("opcounter") or get_option("performance_measuring"):
+        if get_option("performance_measuring"):
             assert(not get_option("opcounter"))
-        assert(any(_driver_data['form'].ufl_cell().cellname() in x for x in
-                   ["vertex", "interval", "quadrilateral", "hexahedron"]))
-        # In case of operator conunting we only assemble the matrix and evaluate the residual
+        assert(isQuadrilateral(get_cell()))
+        # In case of operator counting we only assemble the matrix and evaluate the residual
         # assemble_matrix_timer()
         from dune.perftool.pdelab.driver.timings import apply_jacobian_timer, evaluate_residual_timer
         from dune.perftool.loopy.target import type_floatingpoint
         pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver")
         evaluate_residual_timer()
-        apply_jacobian_timer()
+        if get_form_option("matrix_free"):
+            apply_jacobian_timer()
     elif is_stationary():
         from dune.perftool.pdelab.driver.solve import dune_solve
         vec = dune_solve()
@@ -288,11 +278,33 @@ def generate_driver(formdatas, data):
     return_statement()
 
     from dune.perftool.generation import retrieve_cache_items
-    from cgen import FunctionDeclaration, FunctionBody, Block, Value
-    driver_signature = FunctionDeclaration(Value('bool', 'driver'), [Value('int', 'argc'), Value('char**', 'argv')])
-    contents = [i for i in retrieve_cache_items("preamble", make_generable=True)]
+    from cgen import FunctionDeclaration, FunctionBody, Block, Value, LineComment, Line
+    driver_signature = FunctionDeclaration(Value('int', 'main'), [Value('int', 'argc'), Value('char**', 'argv')])
+
+    contents = []
+
+    def add_section(tag, comment):
+        # Append a commented group of all preambles cached under this tag, if any.
+        condition = "preamble and {}".format(tag)
+        items = list(retrieve_cache_items(condition, make_generable=True))
+        if items:
+            header = [LineComment(comment), Line("\n")]
+            contents.extend(header + items + [Line("\n")])
+
+    add_section("init", "Initialize basic stuff...")
+    add_section("grid", "Setup grid (view)...")
+    add_section("fem", "Set up finite element maps...")
+    add_section("gfs", "Set up grid function spaces...")
+    add_section("constraints", "Set up constraints container...")
+    add_section("gridoperator", "Set up grid operators...")
+    add_section("vector", "Set up solution vectors...")
+    add_section("timings", "Maybe take performance measurements...")
+    add_section("solver", "Set up (non)linear solvers...")
+    add_section("vtk", "Do visualization...")
+    add_section("instat", "Set up instationary stuff...")
+    add_section("printing", "Maybe print residuals and matrices to stdout...")
+    add_section("error", "Maybe calculate errors for test results...")
 
-    from cgen import Line
     if get_option("instrumentation_level") >= 1:
         from dune.perftool.generation import post_include
         post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver")
@@ -301,12 +313,27 @@ def generate_driver(formdatas, data):
         contents.insert(len(contents) - 1, Line(text="DUMP_TIMER({}, driver, {}, true);\n".format(get_option("instrumentation_level"), timestream)))
     contents.insert(0, Line(text="\n"))
     driver_body = Block(contents)
+
+    # Wrap a try/catch block around the driver body
+    from dune.perftool.cgen import CatchBlock, TryCatchBlock, Value, Block, Line
+    catch_blocks = [CatchBlock(Value("Dune::Exception&", "e"),
+                               Block([Line("std::cerr << \"Dune reported error: \" << e << std::endl;\n"),
+                                      Line("return 1;\n"),
+                                      ])
+                               ),
+                    CatchBlock(Value("std::exception&", "e"),
+                               Block([Line("std::cerr << \"Unknown exception thrown!\" << std::endl;\n"),
+                                      Line("return 1;\n"),
+                                      ])
+                               )
+                    ]
+    driver_body = Block([TryCatchBlock(driver_body, catch_blocks)])
     driver = FunctionBody(driver_signature, driver_body)
 
     filename = get_option("driver_file")
 
     from dune.perftool.file import generate_file
-    generate_file(filename, "driver", [driver])
+    generate_file(filename, "driver", [driver], headerguard=False)
 
     # Reset the caching data structure
     from dune.perftool.generation import delete_cache_items
diff --git a/python/dune/perftool/pdelab/driver/constraints.py b/python/dune/perftool/pdelab/driver/constraints.py
index 9158fc6be52d5c95c976eb08041fc3ad5b4a5da6..e3a96df485dc382761587e0950764372102869a6 100644
--- a/python/dune/perftool/pdelab/driver/constraints.py
+++ b/python/dune/perftool/pdelab/driver/constraints.py
@@ -3,7 +3,6 @@ from dune.perftool.generation import (global_context,
                                       preamble,
                                       )
 from dune.perftool.pdelab.driver import (FEM_name_mangling,
-                                         get_formdata,
                                          get_trial_element,
                                          )
 from dune.perftool.pdelab.driver.gridfunctionspace import (name_gfs,
@@ -24,14 +23,26 @@ def name_assembled_constraints():
     return name
 
 
-@preamble
+def has_dirichlet_constraints(is_dirichlet):
+    # A list/tuple counts if any entry is truthy; otherwise the value itself decides.
+    if not isinstance(is_dirichlet, (list, tuple)):
+        return bool(is_dirichlet)
+    return any(map(bool, is_dirichlet))
+
+
+@preamble(section="constraints")
 def assemble_constraints(name):
     element = get_trial_element()
     gfs = name_trial_gfs()
     is_dirichlet = preprocess_leaf_data(element, "is_dirichlet")
-    bctype_function = name_bctype_function(element, is_dirichlet)
-    return "Dune::PDELab::constraints({}, {}, {});".format(bctype_function,
-                                                           gfs,
+    if has_dirichlet_constraints(is_dirichlet):
+        bctype_function = name_bctype_function(element, is_dirichlet)
+        return "Dune::PDELab::constraints({}, {}, {});".format(bctype_function,
+                                                               gfs,
+                                                               name,
+                                                               )
+    else:
+        return "Dune::PDELab::constraints({}, {});".format(gfs,
                                                            name,
                                                            )
 
@@ -59,7 +70,7 @@ def name_bctype_function(element, is_dirichlet):
         return name
 
 
-@preamble
+@preamble(section="constraints")
 def define_bctype_function(element, is_dirichlet, name):
     gv = name_leafview()
     bctype_lambda = name_bctype_lambda(name, is_dirichlet)
@@ -70,13 +81,13 @@ def define_bctype_function(element, is_dirichlet, name):
                                                                                        )
 
 
-@preamble
+@preamble(section="constraints")
 def define_power_bctype_function(element, name, subgfs):
     include_file('dune/pdelab/constraints/common/constraintsparameters.hh', filetag='driver')
     return "Dune::PDELab::PowerConstraintsParameters<decltype({}), {}> {}({});".format(subgfs, element.num_sub_elements(), name, subgfs)
 
 
-@preamble
+@preamble(section="constraints")
 def define_composite_bctype_function(element, is_dirichlet, name, subgfs):
     include_file('dune/pdelab/constraints/common/constraintsparameters.hh', filetag='driver')
     return "Dune::PDELab::CompositeConstraintsParameters<{}> {}({});".format(', '.join('decltype({})'.format(c) for c in subgfs),
@@ -91,7 +102,7 @@ def name_bctype_lambda(name, func):
     return name
 
 
-@preamble
+@preamble(section="constraints")
 def define_intersection_lambda(name, func):
     from ufl.classes import Expr
     if func is None:
@@ -105,7 +116,7 @@ def define_intersection_lambda(name, func):
     raise ValueError("Expression not understood")
 
 
-@preamble
+@preamble(section="constraints")
 def typedef_constraintscontainer(name):
     gfs = type_trial_gfs()
     r = type_range()
@@ -118,7 +129,7 @@ def type_constraintscontainer():
     return name
 
 
-@preamble
+@preamble(section="constraints")
 def define_constraintscontainer(name):
     cctype = type_constraintscontainer()
     return ["{} {};".format(cctype, name), "{}.clear();".format(name)]
diff --git a/python/dune/perftool/pdelab/driver/error.py b/python/dune/perftool/pdelab/driver/error.py
index 6e7fb9128e5437650baf140191311ee4a02eb5f7..a4a7e2d414c7ffffce709c97d80c1b7a329dbef3 100644
--- a/python/dune/perftool/pdelab/driver/error.py
+++ b/python/dune/perftool/pdelab/driver/error.py
@@ -5,11 +5,12 @@ from dune.perftool.generation import (cached,
                                       preamble,
                                       )
 from dune.perftool.options import get_option
-from dune.perftool.pdelab.driver import (get_formdata,
+from dune.perftool.pdelab.driver import (get_form_ident,
                                          get_trial_element,
                                          preprocess_leaf_data,
                                          )
-from dune.perftool.pdelab.driver.gridfunctionspace import (name_trial_gfs,
+from dune.perftool.pdelab.driver.gridfunctionspace import (name_leafview,
+                                                           name_trial_gfs,
                                                            name_trial_subgfs,
                                                            type_range,
                                                            )
@@ -23,7 +24,7 @@ from dune.perftool.pdelab.driver.solve import (define_vector,
 from ufl import MixedElement, TensorElement, VectorElement
 
 
-@preamble
+@preamble(section="error")
 def define_test_fail_variable(name):
     return 'bool {}(false);'.format(name)
 
@@ -48,7 +49,7 @@ def type_discrete_grid_function(gfs):
     return "{}_DGF".format(gfs.upper())
 
 
-@preamble
+@preamble(section="error")
 def define_discrete_grid_function(gfs, vector_name, dgf_name):
     dgf_type = type_discrete_grid_function(gfs)
     return ["using {} = Dune::PDELab::DiscreteGridFunction<decltype({}),decltype({})>;".format(dgf_type, gfs, vector_name),
@@ -61,10 +62,10 @@ def name_discrete_grid_function(gfs, vector_name):
     return dgf_name
 
 
-@preamble
+@preamble(section="error")
 def typedef_difference_squared_adapter(name, treepath):
     sol = name_exact_solution_gridfunction(treepath)
-    vector = name_vector(get_formdata())
+    vector = name_vector(get_form_ident())
     gfs = name_trial_subgfs(treepath)
     dgf = name_discrete_grid_function(gfs, vector)
 
@@ -77,11 +78,11 @@ def type_difference_squared_adapter(treepath):
     return name
 
 
-@preamble
+@preamble(section="error")
 def define_difference_squared_adapter(name, treepath):
     t = type_difference_squared_adapter(treepath)
     sol = name_exact_solution_gridfunction(treepath)
-    vector = name_vector(get_formdata())
+    vector = name_vector(get_form_ident())
     gfs = name_trial_subgfs(treepath)
     dgf = name_discrete_grid_function(gfs, vector)
 
@@ -94,7 +95,7 @@ def name_difference_squared_adapter(treepath):
     return name
 
 
-@preamble
+@preamble(section="error")
 def _accumulate_L2_squared(treepath):
     dsa = name_difference_squared_adapter(treepath)
     accum_error = name_accumulated_L2_error()
@@ -104,14 +105,21 @@ def _accumulate_L2_squared(treepath):
 
     strtp = ", ".join(str(t) for t in treepath)
 
+    gv = name_leafview()
+    sum_error_over_ranks = ""
+    if get_option("parallel"):
+        sum_error_over_ranks = "  err = {}.comm().sum(err);".format(gv)
     return ["{",
             "  // L2 error squared of difference between numerical",
             "  // solution and the interpolation of exact solution",
             "  // for treepath ({})".format(strtp),
             "  typename decltype({})::Traits::RangeType err(0.0);".format(dsa),
             "  Dune::PDELab::integrateGridFunction({}, err, 10);".format(dsa),
+            sum_error_over_ranks,
             "  {} += err;".format(accum_error),
-            "  std::cout << \"L2 Error for treepath {}: \" << err << std::endl;".format(strtp),
+            "  if ({}.comm().rank() == 0){{".format(gv),
+            "    std::cout << \"L2 Error for treepath {}: \" << err << std::endl;".format(strtp),
+            "  }",
             "}",
             ]
 
@@ -139,13 +147,18 @@ def treepath_to_index(element, treepath, offset=0):
 def accumulate_L2_squared():
     element = get_trial_element()
     if isinstance(element, MixedElement):
-        for i in range(element.value_size()):
-            _accumulate_L2_squared(get_treepath(element, i))
+        tree_pathes = (True,) * element.value_size()
+        if get_option("l2error_tree_path") is not None:
+            tree_pathes = list(map(int, get_option("l2error_tree_path").split(',')))
+            assert len(tree_pathes) == element.value_size()
+        for i, path in enumerate(tree_pathes):
+            if path:
+                _accumulate_L2_squared(get_treepath(element, i))
     else:
         _accumulate_L2_squared(())
 
 
-@preamble
+@preamble(section="error")
 def define_accumulated_L2_error(name):
     t = type_range()
     return "{} {}(0.0);".format(t, name)
@@ -157,20 +170,23 @@ def name_accumulated_L2_error():
     return name
 
 
-@preamble
+@preamble(section="error")
 def compare_L2_squared():
     accumulate_L2_squared()
+    gv = name_leafview()
 
     accum_error = name_accumulated_L2_error()
     fail = name_test_fail_variable()
     return ["using std::abs;",
             "using std::isnan;",
-            "std::cout << \"\\nl2errorsquared: \" << {} << std::endl << std::endl;".format(accum_error),
+            "if ({}.comm().rank() == 0){{".format(gv),
+            "  std::cout << \"\\nl2errorsquared: \" << {} << std::endl << std::endl;".format(accum_error),
+            "}",
             "if (isnan({0}) or abs({0})>{1})".format(accum_error, get_option("compare_l2errorsquared")),
             "  {} = true;".format(fail)]
 
 
-@preamble
+@preamble(section="error")
 def return_statement():
     from dune.perftool.pdelab.driver.error import name_test_fail_variable
     fail = name_test_fail_variable()
diff --git a/python/dune/perftool/pdelab/driver/gridfunctionspace.py b/python/dune/perftool/pdelab/driver/gridfunctionspace.py
index c1b439eae390add27ad96337f9d8851671811b41..a956855ad0ee827d8f64e6b731071db82a3c851b 100644
--- a/python/dune/perftool/pdelab/driver/gridfunctionspace.py
+++ b/python/dune/perftool/pdelab/driver/gridfunctionspace.py
@@ -1,7 +1,10 @@
 from dune.perftool.generation import (include_file,
                                       preamble,
                                       )
-from dune.perftool.options import get_option, set_option
+from dune.perftool.options import (get_form_option,
+                                   get_option,
+                                   set_option,
+                                   )
 from dune.perftool.pdelab.driver import (FEM_name_mangling,
                                          get_cell,
                                          get_dimension,
@@ -21,7 +24,7 @@ from dune.perftool.loopy.target import type_floatingpoint
 from ufl import FiniteElement, MixedElement, TensorElement, VectorElement, TensorProductElement
 
 
-@preamble
+@preamble(section="grid")
 def typedef_domainfield(name):
     gridt = type_grid()
     return "using {} = {}::ctype;".format(name, gridt)
@@ -32,7 +35,7 @@ def type_domainfield():
     return "DF"
 
 
-@preamble
+@preamble(section="init")
 def typedef_range(name):
     return "using {} = {};".format(name, type_floatingpoint())
 
@@ -43,16 +46,12 @@ def type_range():
     return name
 
 
-@preamble
+@preamble(section="grid")
 def typedef_grid(name):
     dim = get_dimension()
     if isQuadrilateral(get_trial_element().cell()):
-        # For Yasp Grids the jacobi of the transformation is diagonal and constant on each cell
-        set_option('diagonal_transformation_matrix', True)
-        set_option('constant_transformation_matrix', True)
-
         range_type = type_range()
-        if get_option("grid_offset"):
+        if get_option("yaspgrid_offset"):
             gridt = "Dune::YaspGrid<{0}, Dune::EquidistantOffsetCoordinates<{1}, {0}>>".format(dim, range_type)
         else:
             gridt = "Dune::YaspGrid<{0}, Dune::EquidistantCoordinates<{1}, {0}>>".format(dim, range_type)
@@ -74,7 +73,7 @@ def type_grid():
     return name
 
 
-@preamble
+@preamble(section="grid")
 def define_grid(name):
     include_file("dune/testtools/gridconstruction.hh", filetag="driver")
     ini = name_initree()
@@ -89,7 +88,7 @@ def name_grid():
     return name
 
 
-@preamble
+@preamble(section="grid")
 def typedef_leafview(name):
     grid = type_grid()
     return "using {} = {}::LeafGridView;".format(name, grid)
@@ -101,7 +100,7 @@ def type_leafview():
     return name
 
 
-@preamble
+@preamble(section="grid")
 def define_leafview(name):
     _type = type_leafview()
     grid = name_grid()
@@ -114,16 +113,16 @@ def name_leafview():
     return name
 
 
-@preamble
+@preamble(section="fem")
 def typedef_fem(element, name):
     gv = type_leafview()
     df = type_domainfield()
     r = type_range()
     dim = get_dimension()
 
-    if get_option("blockstructured"):
+    if get_form_option("blockstructured"):
         include_file("dune/perftool/blockstructured/blockstructuredqkfem.hh", filetag="driver")
-        degree = element.degree() * get_option("number_of_blocks")
+        degree = element.degree() * get_form_option("number_of_blocks")
         return "using {} = Dune::PDELab::BlockstructuredQkLocalFiniteElementMap<{}, {}, {}, {}>;" \
             .format(name, gv, df, r, degree)
 
@@ -177,7 +176,7 @@ def type_fem(element):
     return name
 
 
-@preamble
+@preamble(section="fem")
 def define_fem(element, name):
     femtype = type_fem(element)
     from dune.perftool.pdelab.driver import isDG
@@ -234,6 +233,8 @@ def name_gfs(element, is_dirichlet, treepath=(), root=True):
             subgfs.append(name_gfs(subel, is_dirichlet[k:k + subel.value_size()], treepath=treepath + (i,), root=False))
             k = k + subel.value_size()
         name = "_".join(subgfs)
+        if len(subgfs) == 1:
+            name = "{}_dummy".format(name)
         name = "{}_{}".format(name, "_".join(str(t) for t in treepath))
         define_composite_gfs(element, is_dirichlet, name, tuple(subgfs), root)
         return name
@@ -272,6 +273,8 @@ def type_gfs(element, is_dirichlet, root=True):
             subgfs.append(type_gfs(subel, is_dirichlet[k:k + subel.value_size()], root=False))
             k = k + subel.value_size()
         name = "_".join(subgfs)
+        if len(subgfs) == 1:
+            name = "{}_dummy".format(name)
         typedef_composite_gfs(element, name, tuple(subgfs), root)
         return name
     else:
@@ -283,7 +286,7 @@ def type_gfs(element, is_dirichlet, root=True):
         return name
 
 
-@preamble
+@preamble(section="gfs")
 def define_gfs(element, is_dirichlet, name, root):
     gfstype = type_gfs(element, is_dirichlet, root=root)
     gv = name_leafview()
@@ -292,7 +295,7 @@ def define_gfs(element, is_dirichlet, name, root):
             "{}.name(\"{}\");".format(name, name)]
 
 
-@preamble
+@preamble(section="gfs")
 def define_power_gfs(element, is_dirichlet, name, subgfs, root):
     gfstype = type_gfs(element, is_dirichlet, root=root)
     names = ["using namespace Dune::Indices;"]
@@ -300,14 +303,14 @@ def define_power_gfs(element, is_dirichlet, name, subgfs, root):
     return ["{} {}({});".format(gfstype, name, subgfs)] + names
 
 
-@preamble
+@preamble(section="gfs")
 def define_composite_gfs(element, is_dirichlet, name, subgfs, root):
     gfstype = type_gfs(element, is_dirichlet, root=root)
     return ["{} {}({});".format(gfstype, name, ", ".join(subgfs)),
             "{}.update();".format(name)]
 
 
-@preamble
+@preamble(section="gfs")
 def typedef_gfs(element, is_dirichlet, name, root):
     vb = type_vectorbackend(element, root)
     gv = type_leafview()
@@ -316,7 +319,7 @@ def typedef_gfs(element, is_dirichlet, name, root):
     return "using {} = Dune::PDELab::GridFunctionSpace<{}, {}, {}, {}>;".format(name, gv, fem, cass, vb)
 
 
-@preamble
+@preamble(section="gfs")
 def typedef_power_gfs(element, is_dirichlet, name, subgfs, root):
     include_file("dune/pdelab/gridfunctionspace/powergridfunctionspace.hh", filetag="driver")
     vb = type_vectorbackend(element, root)
@@ -325,7 +328,7 @@ def typedef_power_gfs(element, is_dirichlet, name, subgfs, root):
     return "using {} = Dune::PDELab::PowerGridFunctionSpace<{}, {}, {}, {}>;".format(name, subgfs, element.num_sub_elements(), vb, ot)
 
 
-@preamble
+@preamble(section="gfs")
 def typedef_composite_gfs(element, name, subgfs, root):
     vb = type_vectorbackend(element, root)
     ot = type_orderingtag(isinstance(element, FiniteElement))
@@ -333,10 +336,10 @@ def typedef_composite_gfs(element, name, subgfs, root):
     return "using {} = Dune::PDELab::CompositeGridFunctionSpace<{}, {}, {}>;".format(name, vb, ot, args)
 
 
-@preamble
+@preamble(section="gfs")
 def typedef_vectorbackend(name, element, root):
     include_file("dune/pdelab/backend/istl.hh", filetag="driver")
-    if get_option("fastdg") and root:
+    if get_form_option("fastdg") and root:
         blocking = "Dune::PDELab::ISTL::Blocking::fixed"
         if isinstance(element, MixedElement):
             blocksize = ""
@@ -357,33 +360,52 @@ def type_vectorbackend(element, root):
 
 
 def type_orderingtag(leaf):
-    if leaf or not get_option("fastdg"):
+    if leaf or not get_form_option("fastdg"):
         return "Dune::PDELab::LexicographicOrderingTag"
     else:
         return "Dune::PDELab::EntityBlockedOrderingTag"
 
 
-@preamble
+@preamble(section="gfs")
+def typedef_overlapping_dirichlet_constraintsassembler(name):
+    include_file("dune/pdelab/constraints/conforming.hh", filetag="driver")
+    return "using {} = Dune::PDELab::ConformingDirichletConstraints;".format(name)
+
+
+@preamble(section="gfs")
+def typedef_p0parallel_constraintsassembler(name):
+    include_file("dune/pdelab/constraints/p0.hh", filetag="driver")
+    return "using {} = Dune::PDELab::P0ParallelConstraints;".format(name)
+
+
+@preamble(section="gfs")
 def typedef_dirichlet_constraintsassembler(name):
     include_file("dune/pdelab/constraints/conforming.hh", filetag="driver")
     return "using {} = Dune::PDELab::ConformingDirichletConstraints;".format(name)
 
 
-@preamble
+@preamble(section="gfs")
 def typedef_no_constraintsassembler(name):
     return "using {} = Dune::PDELab::NoConstraints;".format(name)
 
 
 def type_constraintsassembler(is_dirichlet):
     assert isinstance(is_dirichlet, bool)
-    if is_dirichlet:
+    overlapping = get_option("overlapping")
+    if is_dirichlet and not overlapping:
         name = "DirichletConstraintsAssember"
         typedef_dirichlet_constraintsassembler(name)
-        return name
+    elif is_dirichlet and overlapping:
+        name = "OverlappingConformingDirichletConstraints"
+        typedef_overlapping_dirichlet_constraintsassembler(name)
+    elif not is_dirichlet and overlapping:
+        name = "P0ParallelConstraints"
+        typedef_p0parallel_constraintsassembler(name)
     else:
+        assert not is_dirichlet and not overlapping
         name = "NoConstraintsAssembler"
         typedef_no_constraintsassembler(name)
-        return name
+    return name
 
 
 def name_trial_subgfs(treepath):
@@ -400,7 +422,7 @@ def name_subgfs(treepath):
     return name
 
 
-@preamble
+@preamble(section="vtk")
 def define_subgfs(name, treepath):
     t = type_subgfs(treepath)
     gfs = name_trial_gfs()
diff --git a/python/dune/perftool/pdelab/driver/gridoperator.py b/python/dune/perftool/pdelab/driver/gridoperator.py
index 4727f4f281ed6e8c6205cf5613fc778b07c476ac..8b5c8c0223b61f3d8a5464170cb5c2e496151e15 100644
--- a/python/dune/perftool/pdelab/driver/gridoperator.py
+++ b/python/dune/perftool/pdelab/driver/gridoperator.py
@@ -2,8 +2,7 @@ from dune.perftool.generation import (get_global_context_value,
                                       include_file,
                                       preamble,
                                       )
-from dune.perftool.pdelab.driver import (form_name_suffix,
-                                         get_cell,
+from dune.perftool.pdelab.driver import (get_cell,
                                          get_dimension,
                                          get_test_element,
                                          get_trial_element,
@@ -22,21 +21,20 @@ from dune.perftool.pdelab.driver.gridfunctionspace import (name_test_gfs,
                                                            type_trial_gfs,
                                                            )
 from dune.perftool.pdelab.localoperator import localoperator_basename
-from dune.perftool.pdelab.parameter import parameterclass_basename
-from dune.perftool.options import get_option
+from dune.perftool.options import get_form_option
 
 
-@preamble
-def typedef_gridoperator(name, formdata):
+@preamble(section="gridoperator")
+def typedef_gridoperator(name, form_ident):
     ugfs = type_trial_gfs()
     vgfs = type_test_gfs()
-    lop = type_localoperator(formdata)
+    lop = type_localoperator(form_ident)
     cc = type_constraintscontainer()
     mb = type_matrixbackend()
     df = type_domainfield()
     r = type_range()
-    if get_option("fastdg"):
-        if not get_option("sumfact"):
+    if get_form_option("fastdg"):
+        if not get_form_option("sumfact"):
             raise PerftoolCodegenError("FastDGGridOperator is only implemented for sumfactorization.")
         include_file("dune/pdelab/gridoperator/fastdg.hh", filetag="driver")
         return "using {} = Dune::PDELab::FastDGGridOperator<{}, {}, {}, {}, {}, {}, {}, {}, {}>;".format(name, ugfs, vgfs, lop, mb, df, r, r, cc, cc)
@@ -45,68 +43,66 @@ def typedef_gridoperator(name, formdata):
         return "using {} = Dune::PDELab::GridOperator<{}, {}, {}, {}, {}, {}, {}, {}, {}>;".format(name, ugfs, vgfs, lop, mb, df, r, r, cc, cc)
 
 
-def type_gridoperator(formdata):
-    name = form_name_suffix("GO", formdata).upper()
-    typedef_gridoperator(name, formdata)
+def type_gridoperator(form_ident):
+    name = "GO_{}".format(form_ident)
+    typedef_gridoperator(name, form_ident)
     return name
 
 
-@preamble
-def define_gridoperator(name, formdata):
-    gotype = type_gridoperator(formdata)
+@preamble(section="gridoperator")
+def define_gridoperator(name, form_ident):
+    gotype = type_gridoperator(form_ident)
     ugfs = name_trial_gfs()
     vgfs = name_test_gfs()
     if ugfs != vgfs:
         raise NotImplementedError("Non-Galerkin methods currently not supported!")
     cc = name_assembled_constraints()
-    lop = name_localoperator(formdata)
+    lop = name_localoperator(form_ident)
     mb = name_matrixbackend()
     return ["{} {}({}, {}, {}, {}, {}, {});".format(gotype, name, ugfs, cc, vgfs, cc, lop, mb),
             "std::cout << \"gfs with \" << {}.size() << \" dofs generated  \"<< std::endl;".format(ugfs),
             "std::cout << \"cc with \" << {}.size() << \" dofs generated  \"<< std::endl;".format(cc)]
 
 
-def name_gridoperator(formdata):
-    name = form_name_suffix("go", formdata)
-    define_gridoperator(name, formdata)
+def name_gridoperator(form_ident):
+    name = "go_{}".format(form_ident)
+    define_gridoperator(name, form_ident)
     return name
 
 
-@preamble
-def typedef_localoperator(name, formdata):
+@preamble(section="gridoperator")
+def typedef_localoperator(name, form_ident):
     ugfs = type_trial_gfs()
     vgfs = type_test_gfs()
-    data = get_global_context_value("data")
-    filename = get_option("operator_file")
+    filename = get_form_option("filename", form_ident)
     include_file(filename, filetag="driver")
-    lopname = localoperator_basename(formdata, data)
+    lopname = localoperator_basename(form_ident)
     range_type = type_range()
     return "using {} = {}<{}, {}, {}>;".format(name, lopname, ugfs, vgfs, range_type)
 
 
-def type_localoperator(formdata):
-    name = form_name_suffix("LOP", formdata).upper()
-    typedef_localoperator(name, formdata)
+def type_localoperator(form_ident):
+    name = "LOP_{}".format(form_ident.upper())
+    typedef_localoperator(name, form_ident)
     return name
 
 
-@preamble
-def define_localoperator(name, formdata):
+@preamble(section="gridoperator")
+def define_localoperator(name, form_ident):
     trial_gfs = name_trial_gfs()
     test_gfs = name_test_gfs()
-    loptype = type_localoperator(formdata)
+    loptype = type_localoperator(form_ident)
     ini = name_initree()
-    params = name_parameters(formdata)
-    return "{} {}({}, {}, {}, {});".format(loptype, name, trial_gfs, test_gfs, ini, params)
+    return "{} {}({}, {}, {});".format(loptype, name, trial_gfs, test_gfs, ini)
 
 
-def name_localoperator(formdata):
-    name = form_name_suffix("lop", formdata)
-    define_localoperator(name, formdata)
+def name_localoperator(form_ident):
+    name = "lop_{}".format(form_ident)
+    define_localoperator(name, form_ident)
     return name
 
 
-@preamble
+@preamble(section="gridoperator")
 def define_dofestimate(name):
     # Provide a worstcase estimate for the number of entries per row based
     # on the given gridfunction space and cell geometry
@@ -133,7 +129,7 @@ def name_dofestimate():
     return name
 
 
-@preamble
+@preamble(section="gridoperator")
 def typedef_matrixbackend(name):
     include_file("dune/pdelab/backend/istl.hh", filetag="driver")
     return "using {} = Dune::PDELab::ISTL::BCRSMatrixBackend<>;".format(name)
@@ -145,7 +141,7 @@ def type_matrixbackend():
     return name
 
 
-@preamble
+@preamble(section="gridoperator")
 def define_matrixbackend(name):
     mbtype = type_matrixbackend()
     dof = name_dofestimate()
@@ -156,21 +152,3 @@ def name_matrixbackend():
     name = "mb"
     define_matrixbackend(name)
     return name
-
-
-def type_parameters(formdata):
-    data = get_global_context_value("data")
-    name = parameterclass_basename(formdata, data)
-    return name
-
-
-@preamble
-def define_parameters(name, formdata):
-    partype = type_parameters(formdata)
-    return "{} {};".format(partype, name)
-
-
-def name_parameters(formdata):
-    name = form_name_suffix("params", formdata)
-    define_parameters(name, formdata)
-    return name
diff --git a/python/dune/perftool/pdelab/driver/instationary.py b/python/dune/perftool/pdelab/driver/instationary.py
index fe2cbc5a5a819c7e274759ab9e91cc61d1b14e34..c37792896a23837131ca71f83dcc9f7e497476a3 100644
--- a/python/dune/perftool/pdelab/driver/instationary.py
+++ b/python/dune/perftool/pdelab/driver/instationary.py
@@ -1,8 +1,7 @@
 from dune.perftool.generation import (include_file,
                                       preamble,
                                       )
-from dune.perftool.pdelab.driver import (get_formdata,
-                                         get_mass_formdata,
+from dune.perftool.pdelab.driver import (get_form_ident,
                                          get_trial_element,
                                          is_linear,
                                          name_initree,
@@ -12,9 +11,11 @@ from dune.perftool.pdelab.driver.gridfunctionspace import (name_trial_gfs,
                                                            type_range,
                                                            )
 from dune.perftool.pdelab.driver.gridoperator import (name_gridoperator,
-                                                      name_parameters,
-                                                      type_gridoperator,)
-from dune.perftool.pdelab.driver.constraints import (name_bctype_function,
+                                                      type_gridoperator,
+                                                      name_localoperator,
+                                                      )
+from dune.perftool.pdelab.driver.constraints import (has_dirichlet_constraints,
+                                                     name_bctype_function,
                                                      name_constraintscontainer,
                                                      )
 from dune.perftool.pdelab.driver.interpolate import (interpolate_dirichlet_data,
@@ -31,13 +32,15 @@ from dune.perftool.pdelab.driver.solve import (print_matrix,
                                                )
 from dune.perftool.pdelab.driver.vtk import (name_vtk_sequence_writer,
                                              visualize_initial_condition,
-                                             )
-from dune.perftool.options import get_option
+                                             name_predicate)
+from dune.perftool.options import (get_form_option,
+                                   get_option,
+                                   )
 
 
 def solve_instationary():
     # Create time loop
-    if get_option('matrix_free'):
+    if get_form_option('matrix_free'):
         raise NotImplementedError("Instationary matrix free not implemented!")
     else:
         time_loop()
@@ -46,20 +49,27 @@ def solve_instationary():
     print_matrix()
 
 
-@preamble
+@preamble(section="instat")
 def time_loop():
     ini = name_initree()
-    formdata = get_formdata()
-    params = name_parameters(formdata)
+    lop = name_localoperator(get_form_ident())
     time = name_time()
     element = get_trial_element()
-    is_dirichlet = preprocess_leaf_data(element, "is_dirichlet")
-    bctype = name_bctype_function(element, is_dirichlet)
-    gfs = name_trial_gfs()
-    cc = name_constraintscontainer()
-    vector_type = type_vector(formdata)
-    vector = name_vector(formdata)
+    vector_type = type_vector(get_form_ident())
+    vector = name_vector(get_form_ident())
     interpolate_dirichlet_data(vector)
+    gfs = name_trial_gfs()
+
+    is_dirichlet = preprocess_leaf_data(element, "is_dirichlet")
+    assemble_new_constraints = ""
+    if has_dirichlet_constraints(is_dirichlet):
+        bctype = name_bctype_function(element, is_dirichlet)
+        cc = name_constraintscontainer()
+        assemble_new_constraints = ("  // Assemble constraints for new time step\n"
+                                    "  {}.setTime({}+dt);\n"
+                                    "  Dune::PDELab::constraints({}, {}, {});\n"
+                                    "\n".format(lop, time, bctype, gfs, cc)
+                                    )
 
     # Choose between explicit and implicit time stepping
     explicit = get_option('explicit_time_stepping')
@@ -67,25 +77,27 @@ def time_loop():
         osm = name_explicitonestepmethod()
         apply_call = "{}.apply(time, dt, {}, {}new);".format(osm, vector, vector)
     else:
-        dirichlet = preprocess_leaf_data(element, "dirichlet_expression")
-        boundary = name_boundary_function(element, dirichlet)
         osm = name_onestepmethod()
+    if has_dirichlet_constraints(is_dirichlet):
+        dirichlet = preprocess_leaf_data(element, "interpolate_expression")
+        boundary = name_boundary_function(element, dirichlet)
         apply_call = "{}.apply(time, dt, {}, {}, {}new);".format(osm, vector, boundary, vector)
+    else:
+        apply_call = "{}.apply(time, dt, {}, {}new);".format(osm, vector, vector)
 
     # Setup visualization
     visualize_initial_condition()
     vtk_sequence_writer = name_vtk_sequence_writer()
 
+    predicate = name_predicate()
+
     return ["",
             "double T = {}.get<double>(\"instat.T\", 1.0);".format(ini),
             "double dt = {}.get<double>(\"instat.dt\", 0.1);".format(ini),
             "int step_number(0);"
             "int output_every_nth = {}.get<int>(\"instat.output_every_nth\", 1);".format(ini),
             "while (time<T-1e-8){",
-            "  // Assemble constraints for new time step",
-            "  {}.setTime({}+dt);".format(params, time),
-            "  Dune::PDELab::constraints({}, {}, {});".format(bctype, gfs, cc),
-            "",
+            "{}".format(assemble_new_constraints),
             "  // Do time step",
             "  {} {}new({});".format(vector_type, vector, vector),
             "  {}".format(apply_call),
@@ -97,13 +109,16 @@ def time_loop():
             "  step_number += 1;",
             "  if (step_number%output_every_nth == 0){",
             "    // Output to VTK File",
+            "    {}.vtkWriter()->clear();".format(vtk_sequence_writer),
+            "    Dune::PDELab::addSolutionToVTKWriter({}, {}, {},".format(vtk_sequence_writer, gfs, vector),
+            "                                         Dune::PDELab::vtk::defaultNameScheme(), {});".format(predicate),
             "    {}.write({}, Dune::VTK::appendedraw);".format(vtk_sequence_writer, time),
             "  }",
             "}",
             ""]
 
 
-@preamble
+@preamble(section="init")
 def define_time(name):
     return "double {} = 0.0;".format(name)
 
@@ -113,7 +128,7 @@ def name_time():
     return "time"
 
 
-@preamble
+@preamble(section="instat")
 def typedef_timesteppingmethod(name):
     r_type = type_range()
     explicit = get_option('explicit_time_stepping')
@@ -128,7 +143,7 @@ def type_timesteppingmethod():
     return "TSM"
 
 
-@preamble
+@preamble(section="instat")
 def define_timesteppingmethod(name):
     tsm_type = type_timesteppingmethod()
     explicit = get_option('explicit_time_stepping')
@@ -144,11 +159,11 @@ def name_timesteppingmethod():
     return "tsm"
 
 
-@preamble
+@preamble(section="gridoperator")
 def typedef_instationarygridoperator(name):
     include_file("dune/pdelab/gridoperator/onestep.hh", filetag="driver")
-    go_type = type_gridoperator(get_formdata())
-    mass_go_type = type_gridoperator(get_mass_formdata())
+    go_type = type_gridoperator(get_form_ident())
+    mass_go_type = type_gridoperator("mass")
     explicit = get_option('explicit_time_stepping')
     if explicit:
         return "using {} = Dune::PDELab::OneStepGridOperator<{},{},false>;".format(name, go_type, mass_go_type)
@@ -161,11 +176,11 @@ def type_instationarygridoperator():
     return "IGO"
 
 
-@preamble
+@preamble(section="gridoperator")
 def define_instationarygridoperator(name):
     igo_type = type_instationarygridoperator()
-    go = name_gridoperator(get_formdata())
-    mass_go = name_gridoperator(get_mass_formdata())
+    go = name_gridoperator(get_form_ident())
+    mass_go = name_gridoperator("mass")
     return "{} {}({}, {});".format(igo_type, name, go, mass_go)
 
 
@@ -174,12 +189,12 @@ def name_instationarygridoperator():
     return "igo"
 
 
-@preamble
+@preamble(section="instat")
 def typedef_onestepmethod(name):
     r_type = type_range()
     igo_type = type_instationarygridoperator()
     snp_type = type_stationarynonlinearproblemssolver(igo_type)
-    vector_type = type_vector(get_formdata())
+    vector_type = type_vector(get_form_ident())
     return "using {} = Dune::PDELab::OneStepMethod<{}, {}, {}, {}, {}>;".format(name, r_type, igo_type, snp_type, vector_type, vector_type)
 
 
@@ -188,7 +203,7 @@ def type_onestepmethod():
     return "OSM"
 
 
-@preamble
+@preamble(section="instat")
 def define_onestepmethod(name):
     ilptype = type_onestepmethod()
     tsm = name_timesteppingmethod()
@@ -203,12 +218,12 @@ def name_onestepmethod():
     return "osm"
 
 
-@preamble
+@preamble(section="instat")
 def typedef_explicitonestepmethod(name):
     r_type = type_range()
     igo_type = type_instationarygridoperator()
     ls_type = type_linearsolver()
-    vector_type = type_vector(get_formdata())
+    vector_type = type_vector(get_form_ident())
     return "using {} = Dune::PDELab::ExplicitOneStepMethod<{}, {}, {}, {}>;".format(name, r_type, igo_type, ls_type, vector_type)
 
 
@@ -217,7 +232,7 @@ def type_explicitonestepmethod():
     return "EOSM"
 
 
-@preamble
+@preamble(section="instat")
 def define_explicitonestepmethod(name):
     eosm_type = type_explicitonestepmethod()
     tsm = name_timesteppingmethod()
diff --git a/python/dune/perftool/pdelab/driver/interpolate.py b/python/dune/perftool/pdelab/driver/interpolate.py
index ebbdbfd3d044d276a8692660252d2a7768a9389a..75846a1f57bead0d7ca87c0014a3e8715c2836d6 100644
--- a/python/dune/perftool/pdelab/driver/interpolate.py
+++ b/python/dune/perftool/pdelab/driver/interpolate.py
@@ -5,7 +5,7 @@ from dune.perftool.generation import (cached,
                                       preamble,
                                       )
 from dune.perftool.pdelab.driver import (FEM_name_mangling,
-                                         get_formdata,
+                                         get_form_ident,
                                          get_trial_element,
                                          is_stationary,
                                          preprocess_leaf_data,
@@ -13,29 +13,19 @@ from dune.perftool.pdelab.driver import (FEM_name_mangling,
 from dune.perftool.pdelab.driver.gridfunctionspace import (name_trial_gfs,
                                                            name_leafview,
                                                            )
-from dune.perftool.pdelab.driver.gridoperator import (name_parameters,)
-
 from ufl import FiniteElement, MixedElement, TensorElement, VectorElement, TensorProductElement
 
 
-def _do_interpolate(dirichlet):
-    if isinstance(dirichlet, (list, tuple)):
-        return any(bool(d) for d in dirichlet)
-    else:
-        return bool(dirichlet)
-
-
 def interpolate_dirichlet_data(name):
     element = get_trial_element()
-    is_dirichlet = preprocess_leaf_data(element, "is_dirichlet")
-    if _do_interpolate(is_dirichlet) or not is_stationary():
+    func = preprocess_leaf_data(element, "interpolate_expression", applyZeroDefault=False)
+    if func is not None:
+        bf = name_boundary_function(element, func)
         gfs = name_trial_gfs()
-        dirichlet = preprocess_leaf_data(element, "dirichlet_expression")
-        bf = name_boundary_function(element, dirichlet)
         interpolate_vector(bf, gfs, name)
 
 
-@preamble
+@preamble(section="vector")
 def interpolate_vector(func, gfs, name):
     return "Dune::PDELab::interpolate({}, {}, {});".format(func,
                                                            gfs,
@@ -52,6 +42,8 @@ def name_boundary_function(element, func):
             childs.append(name_boundary_function(subel, func[k:k + subel.value_size()]))
             k = k + subel.value_size()
         name = "_".join(childs)
+        if len(childs) == 1:
+            name = "{}_dummy".format(name)
         define_compositegfs_parameterfunction(name, tuple(childs))
         return name
     else:
@@ -61,14 +53,14 @@ def name_boundary_function(element, func):
         return name
 
 
-@preamble
+@preamble(section="vector")
 def define_compositegfs_parameterfunction(name, children):
     return "Dune::PDELab::CompositeGridFunction<{}> {}({});".format(', '.join('decltype({})'.format(c) for c in children),
                                                                     name,
                                                                     ', '.join(children))
 
 
-@preamble
+@preamble(section="vector")
 def define_boundary_function(name, dirichlet):
     gv = name_leafview()
     lambdaname = name_boundary_lambda(dirichlet)
@@ -79,11 +71,13 @@ def define_boundary_function(name, dirichlet):
                                                                                       lambdaname,
                                                                                       )
     else:
-        params = name_parameters(get_formdata())
+        from dune.perftool.pdelab.driver.gridoperator import name_localoperator
+        lop = name_localoperator(get_form_ident())
         return "auto {} = Dune::PDELab::makeInstationaryGridFunctionFromCallable({}, {}, {});".format(name,
                                                                                                       gv,
                                                                                                       lambdaname,
-                                                                                                      params)
+                                                                                                      lop,
+                                                                                                      )
 
 
 @cached
@@ -93,7 +87,7 @@ def name_boundary_lambda(boundary):
     return name
 
 
-@preamble
+@preamble(section="vector")
 def define_boundary_lambda(name, boundary):
     if boundary is None:
         boundary = 0.0
diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py
index e877ece672cf79f7db0ed373e96257d9fbfb98a7..0648df399d809fd84d30624711dfc2fb02bbc631 100644
--- a/python/dune/perftool/pdelab/driver/solve.py
+++ b/python/dune/perftool/pdelab/driver/solve.py
@@ -1,32 +1,38 @@
 from dune.perftool.generation import (include_file,
                                       preamble,
                                       )
-from dune.perftool.options import get_option
-from dune.perftool.pdelab.driver import (form_name_suffix,
-                                         get_formdata,
+from dune.perftool.options import (get_form_option,
+                                   get_option,
+                                   )
+from dune.perftool.pdelab.driver import (get_form_ident,
                                          is_linear,
                                          name_initree,
                                          )
-from dune.perftool.pdelab.driver.gridfunctionspace import name_trial_gfs
+from dune.perftool.pdelab.driver.gridfunctionspace import (name_trial_gfs,
+                                                           type_domainfield,
+                                                           type_trial_gfs,
+                                                           )
+from dune.perftool.pdelab.driver.constraints import (type_constraintscontainer,
+                                                     name_assembled_constraints,
+                                                     )
 from dune.perftool.pdelab.driver.gridoperator import (name_gridoperator,
                                                       type_gridoperator,
                                                       )
 from dune.perftool.pdelab.driver.interpolate import interpolate_dirichlet_data
 
 
-@preamble
+@preamble(section="solver")
 def dune_solve():
+    form_ident = get_form_ident()
     # Test if form is linear in ansatzfunction
     linear = is_linear()
 
     # Test wether we want to do matrix free operator evaluation
-    matrix_free = get_option('matrix_free')
-
+    matrix_free = get_form_option('matrix_free')
     # Get right solve command
     if linear and matrix_free:
-        formdata = get_formdata()
-        go = name_gridoperator(formdata)
-        x = name_vector(formdata)
+        go = name_gridoperator(form_ident)
+        x = name_vector(form_ident)
         include_file("dune/perftool/matrixfree.hh", filetag="driver")
         solve = "solveMatrixFree({},{});".format(go, x)
     elif linear and not matrix_free:
@@ -34,14 +40,13 @@ def dune_solve():
         solve = "{}.apply();".format(slp)
     elif not linear and matrix_free:
         # TODO copy of linear case and obviously broken, used to generate something ;)
-        formdata = get_formdata()
-        go = name_gridoperator(formdata)
-        x = name_vector(formdata)
+        go = name_gridoperator(form_ident)
+        x = name_vector(form_ident)
         include_file("dune/perftool/matrixfree.hh", filetag="driver")
         solve = "solveNonlinearMatrixFree({},{});".format(go, x)
     elif not linear and not matrix_free:
-        go_type = type_gridoperator(get_formdata())
-        go = name_gridoperator(get_formdata())
+        go_type = type_gridoperator(form_ident)
+        go = name_gridoperator(form_ident)
         snp = name_stationarynonlinearproblemsolver(go_type, go)
         solve = "{}.apply();".format(snp)
 
@@ -49,62 +54,62 @@ def dune_solve():
     print_matrix()
 
     if get_option('instrumentation_level') >= 2:
-        from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream
+        from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier
+        timestream = name_timing_stream()
         setup_timer()
         from dune.perftool.generation import post_include
         post_include("HP_DECLARE_TIMER(solve);", filetag="driver")
 
-        # Print times after solving
-        from dune.perftool.generation import get_global_context_value
-        formdatas = get_global_context_value("formdatas")
-        print_times = []
-        for formdata in formdatas:
-            from dune.perftool.pdelab.driver.gridoperator import name_localoperator
-            lop_name = name_localoperator(formdata)
-            timestream = name_timing_stream()
-            print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
-
         solve = ["HP_TIMER_START(solve);",
                  "{}".format(solve),
                  "HP_TIMER_STOP(solve);",
                  "DUMP_TIMER({}, solve, {}, true);".format(get_option("instrumentation_level"), timestream),
                  ]
+
         if get_option('instrumentation_level') >= 3:
-            solve.extend(print_times)
+            from dune.perftool.pdelab.driver.gridoperator import name_localoperator
+            lop_name = name_localoperator(form_ident)
+            solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
     return solve
 
 
-def name_vector(formdata):
-    name = form_name_suffix("x", formdata)
-    define_vector(name, formdata)
+def name_vector(form_ident):
+    name = "x_{}".format(form_ident)
+    define_vector(name, form_ident)
     interpolate_dirichlet_data(name)
     return name
 
 
-@preamble
-def typedef_vector(name, formdata):
-    go_type = type_gridoperator(formdata)
-    return "using {} = {}::Traits::Domain;".format(name, go_type)
+@preamble(section="vector")
+def typedef_vector(name, form_ident):
+    gfs = type_trial_gfs()
+    df = type_domainfield()
+    return "using {} = Dune::PDELab::Backend::Vector<{},{}>;".format(name, gfs, df)
 
 
-def type_vector(formdata):
-    name = form_name_suffix("V", formdata).upper()
-    typedef_vector(name, formdata)
+def type_vector(form_ident):
+    name = "V_{}".format(form_ident.upper())
+    typedef_vector(name, form_ident)
     return name
 
 
-@preamble
-def define_vector(name, formdata):
-    vtype = type_vector(formdata)
+@preamble(section="vector")
+def define_vector(name, form_ident):
+    vtype = type_vector(form_ident)
     gfs = name_trial_gfs()
     return ["{} {}({});".format(vtype, name, gfs), "{} = 0.0;".format(name)]
 
 
-@preamble
+@preamble(section="solver")
 def typedef_linearsolver(name):
     include_file("dune/pdelab/backend/istl.hh", filetag="driver")
-    return "using {} = Dune::PDELab::ISTLBackend_SEQ_SuperLU;".format(name)
+    if get_option('overlapping'):
+        gfs = type_trial_gfs()
+        cc = type_constraintscontainer()
+        return "using {} = Dune::PDELab::ISTLBackend_OVLP_BCGS_ILU0<{},{}>;".format(name, gfs, cc)
+    else:
+        return "using {} = Dune::PDELab::ISTLBackend_SEQ_SuperLU;".format(name)
 
 
 def type_linearsolver():
@@ -113,10 +118,15 @@ def type_linearsolver():
     return name
 
 
-@preamble
+@preamble(section="solver")
 def define_linearsolver(name):
     lstype = type_linearsolver()
-    return "{} {}(false);".format(lstype, name)
+    if get_option('overlapping'):
+        gfs = name_trial_gfs()
+        cc = name_assembled_constraints()
+        return "{} {}({}, {});".format(lstype, name, gfs, cc)
+    else:
+        return "{} {}(false);".format(lstype, name)
 
 
 def name_linearsolver():
@@ -125,7 +135,7 @@ def name_linearsolver():
     return name
 
 
-@preamble
+@preamble(section="solver")
 def define_reduction(name):
     ini = name_initree()
     return "double {} = {}.get<double>(\"reduction\", 1e-12);".format(name, ini)
@@ -137,12 +147,12 @@ def name_reduction():
     return name
 
 
-@preamble
+@preamble(section="solver")
 def typedef_stationarylinearproblemsolver(name):
     include_file("dune/pdelab/stationary/linearproblem.hh", filetag="driver")
-    gotype = type_gridoperator(get_formdata())
+    gotype = type_gridoperator(get_form_ident())
     lstype = type_linearsolver()
-    xtype = type_vector(get_formdata())
+    xtype = type_vector(get_form_ident())
     return "using {} = Dune::PDELab::StationaryLinearProblemSolver<{}, {}, {}>;".format(name, gotype, lstype, xtype)
 
 
@@ -151,13 +161,12 @@ def type_stationarylinearproblemsolver():
     return "SLP"
 
 
-@preamble
+@preamble(section="solver")
 def define_stationarylinearproblemsolver(name):
     slptype = type_stationarylinearproblemsolver()
-    formdata = get_formdata()
-    go = name_gridoperator(formdata)
+    go = name_gridoperator(get_form_ident())
     ls = name_linearsolver()
-    x = name_vector(formdata)
+    x = name_vector(get_form_ident())
     red = name_reduction()
     return "{} {}({}, {}, {}, {});".format(slptype, name, go, ls, x, red)
 
@@ -167,11 +176,11 @@ def name_stationarylinearproblemsolver():
     return "slp"
 
 
-@preamble
+@preamble(section="solver")
 def typedef_stationarynonlinearproblemsolver(name, go_type):
     include_file("dune/pdelab/newton/newton.hh", filetag="driver")
     ls_type = type_linearsolver()
-    x_type = type_vector(get_formdata())
+    x_type = type_vector(get_form_ident())
     return "using {} = Dune::PDELab::Newton<{}, {}, {}>;".format(name, go_type, ls_type, x_type)
 
 
@@ -181,10 +190,10 @@ def type_stationarynonlinearproblemssolver(go_type):
     return name
 
 
-@preamble
+@preamble(section="solver")
 def define_stationarynonlinearproblemsolver(name, go_type, go):
     snptype = type_stationarynonlinearproblemssolver(go_type)
-    x = name_vector(get_formdata())
+    x = name_vector(get_form_ident())
     ls = name_linearsolver()
     return "{} {}({}, {}, {});".format(snptype, name, go, x, ls)
 
@@ -195,13 +204,12 @@ def name_stationarynonlinearproblemsolver(go_type, go):
     return name
 
 
-@preamble
+@preamble(section="printing")
 def print_residual():
     ini = name_initree()
-    formdata = get_formdata()
-    n_go = name_gridoperator(formdata)
-    v = name_vector(formdata)
-    t_v = type_vector(formdata)
+    n_go = name_gridoperator(get_form_ident())
+    v = name_vector(get_form_ident())
+    t_v = type_vector(get_form_ident())
     include_file("random", system=True, filetag="driver")
 
     return ["if ({}.get<bool>(\"printresidual\", false)) {{".format(ini),
@@ -219,14 +227,13 @@ def print_residual():
             "}"]
 
 
-@preamble
+@preamble(section="printing")
 def print_matrix():
-    formdata = get_formdata()
     ini = name_initree()
-    t_go = type_gridoperator(formdata)
-    n_go = name_gridoperator(formdata)
-    v = name_vector(formdata)
-    t_v = type_vector(formdata)
+    t_go = type_gridoperator(get_form_ident())
+    n_go = name_gridoperator(get_form_ident())
+    v = name_vector(get_form_ident())
+    t_v = type_vector(get_form_ident())
 
     return ["if ({}.get<bool>(\"printmatrix\", false)) {{".format(ini),
             "  // Setup random input",
diff --git a/python/dune/perftool/pdelab/driver/timings.py b/python/dune/perftool/pdelab/driver/timings.py
index d003c1c6226a243788fb0bf4e777bee7364538c2..84f0ce839d7684d165faa6ab07b57dfe340d6b13 100644
--- a/python/dune/perftool/pdelab/driver/timings.py
+++ b/python/dune/perftool/pdelab/driver/timings.py
@@ -1,13 +1,13 @@
 """ Timing related generator functions """
 
-from dune.perftool.options import get_option, set_option
+from dune.perftool.options import get_option
 from dune.perftool.generation import (cached,
                                       include_file,
                                       pre_include,
                                       post_include,
                                       preamble,
                                       )
-from dune.perftool.pdelab.driver import (get_formdata,
+from dune.perftool.pdelab.driver import (get_form_ident,
                                          name_initree,
                                          name_mpihelper,
                                          )
@@ -24,7 +24,7 @@ from dune.perftool.pdelab.driver.solve import (name_vector,
                                                )
 
 
-@preamble
+@preamble(section="timings")
 def define_timing_identifier(name):
     ini = name_initree()
     return "auto {} = {}.get<std::string>(\"identifier\", std::string(argv[0]));".format(name, ini)
@@ -36,7 +36,7 @@ def name_timing_identifier():
     return name
 
 
-@preamble
+@preamble(section="timings")
 def dump_dof_numbers(stream):
     ident = name_timing_identifier()
     level = get_option("instrumentation_level")
@@ -51,7 +51,7 @@ def dump_dof_numbers(stream):
             ]
 
 
-@preamble
+@preamble(section="timings")
 def define_timing_stream(name):
     include_file('fstream', filetag='driver', system=True)
     include_file('sstream', filetag='driver', system=True)
@@ -81,12 +81,11 @@ def setup_timer():
     include_file("dune/perftool/common/timer.hh", filetag="driver")
 
 
-@preamble
+@preamble(section="timings")
 def evaluate_residual_timer():
-    formdata = get_formdata()
-    n_go = name_gridoperator(formdata)
-    v = name_vector(formdata)
-    t_v = type_vector(formdata)
+    n_go = name_gridoperator(get_form_ident())
+    v = name_vector(get_form_ident())
+    t_v = type_vector(get_form_ident())
     setup_timer()
 
     if get_option('instrumentation_level') >= 2:
@@ -96,12 +95,9 @@ def evaluate_residual_timer():
         timestream = name_timing_stream()
         print_times = []
 
-    from dune.perftool.generation import get_global_context_value
-    formdatas = get_global_context_value("formdatas")
-    for formdata in formdatas:
-        lop_name = name_localoperator(formdata)
-        if get_option('instrumentation_level') >= 3:
-            print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+    lop_name = name_localoperator(get_form_ident())
+    if get_option('instrumentation_level') >= 3:
+        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
     if get_option('instrumentation_level') >= 2:
         evaluation = ["HP_TIMER_START(residual_evaluation);",
@@ -117,15 +113,11 @@ def evaluate_residual_timer():
     return evaluation
 
 
-@preamble
+@preamble(section="timings")
 def apply_jacobian_timer():
-    # Set the matrix_free option to True!
-    set_option("matrix_free", True)
-
-    formdata = get_formdata()
-    n_go = name_gridoperator(formdata)
-    v = name_vector(formdata)
-    t_v = type_vector(formdata)
+    n_go = name_gridoperator(get_form_ident())
+    v = name_vector(get_form_ident())
+    t_v = type_vector(get_form_ident())
     setup_timer()
 
     if get_option('instrumentation_level') >= 2:
@@ -135,12 +127,9 @@ def apply_jacobian_timer():
         timestream = name_timing_stream()
         print_times = []
 
-    from dune.perftool.generation import get_global_context_value
-    formdatas = get_global_context_value("formdatas")
-    for formdata in formdatas:
-        lop_name = name_localoperator(formdata)
-        if get_option('instrumentation_level') >= 3:
-            print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+    lop_name = name_localoperator(get_form_ident())
+    if get_option('instrumentation_level') >= 3:
+        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
     if get_option('instrumentation_level') >= 2:
         evaluation = ["HP_TIMER_START(apply_jacobian);",
@@ -156,13 +145,12 @@ def apply_jacobian_timer():
     return evaluation
 
 
-@preamble
+@preamble(section="timings")
 def assemble_matrix_timer():
-    formdata = get_formdata()
-    t_go = type_gridoperator(formdata)
-    n_go = name_gridoperator(formdata)
-    v = name_vector(formdata)
-    t_v = type_vector(formdata)
+    t_go = type_gridoperator(get_form_ident())
+    n_go = name_gridoperator(get_form_ident())
+    v = name_vector(get_form_ident())
+    t_v = type_vector(get_form_ident())
     setup_timer()
 
     if get_option('instrumentation_level') >= 2:
@@ -172,12 +160,9 @@ def assemble_matrix_timer():
         timestream = name_timing_stream()
         print_times = []
 
-    from dune.perftool.generation import get_global_context_value
-    formdatas = get_global_context_value("formdatas")
-    for formdata in formdatas:
-        lop_name = name_localoperator(formdata)
-        if get_option('instrumentation_level') >= 3:
-            print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+    lop_name = name_localoperator(get_form_ident())
+    if get_option('instrumentation_level') >= 3:
+        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
     if get_option('instrumentation_level') >= 2:
         assembly = ["HP_TIMER_START(matrix_assembly);",
diff --git a/python/dune/perftool/pdelab/driver/visitor.py b/python/dune/perftool/pdelab/driver/visitor.py
index 8ef37f0d8975fee436fc91c7092a62f5a27dc84f..6549c9a18b8780a983abcfd0cc0be5f07d2e1e3f 100644
--- a/python/dune/perftool/pdelab/driver/visitor.py
+++ b/python/dune/perftool/pdelab/driver/visitor.py
@@ -28,12 +28,21 @@ class DriverUFL2PymbolicVisitor(UFL2LoopyVisitor):
         driver_using_statement("std::min")
         return UFL2LoopyVisitor.min_value(self, o)
 
+    def coefficient(self, o):
+        if o.count() == 2:
+            from dune.perftool.pdelab.driver import get_form_ident
+            from dune.perftool.pdelab.driver.gridoperator import name_localoperator
+            lop = name_localoperator(get_form_ident())
+            return prim.Call(prim.Variable("{}.getTime".format(lop)), ())
+        else:
+            return UFL2LoopyVisitor.coefficient(self, o)
+
 
 def ufl_to_code(expr, boundary=True):
     # So far, we only considered this code branch on boundaries!
     assert boundary
-    from dune.perftool.pdelab.driver import get_formdata
-    with global_context(integral_type="exterior_facet", formdata=get_formdata()):
+    from dune.perftool.pdelab.driver import get_form_ident
+    with global_context(integral_type="exterior_facet", form_identifier=get_form_ident()):
         visitor = DriverUFL2PymbolicVisitor()
         from pymbolic.mapper.c_code import CCodeMapper
         ccm = CCodeMapper()
diff --git a/python/dune/perftool/pdelab/driver/vtk.py b/python/dune/perftool/pdelab/driver/vtk.py
index 6b8b52adc22cf16ec493592dfc79fa5462be8f0b..3004c4892870d078e5611c5f1ac3eabd4da704ee 100644
--- a/python/dune/perftool/pdelab/driver/vtk.py
+++ b/python/dune/perftool/pdelab/driver/vtk.py
@@ -1,8 +1,8 @@
 from dune.perftool.generation import (include_file,
                                       preamble,
                                       )
-from dune.perftool.options import get_option
-from dune.perftool.pdelab.driver import (get_formdata,
+from dune.perftool.options import get_form_option
+from dune.perftool.pdelab.driver import (get_form_ident,
                                          get_trial_element,
                                          name_initree,
                                          preprocess_leaf_data,
@@ -15,7 +15,7 @@ from dune.perftool.pdelab.driver.gridfunctionspace import (name_leafview,
 from dune.perftool.pdelab.driver.solve import name_vector
 
 
-@preamble
+@preamble(section="vtk")
 def define_vtkfile(name):
     ini = name_initree()
     include_file("string", filetag="driver")
@@ -27,7 +27,7 @@ def name_vtkfile():
     return "vtkfile"
 
 
-@preamble
+@preamble(section="vtk")
 def typedef_vtkwriter(name):
     include_file("dune/grid/io/file/vtk/subsamplingvtkwriter.hh", filetag="driver")
     gv = type_leafview()
@@ -39,14 +39,14 @@ def type_vtkwriter():
     return "VTKWriter"
 
 
-@preamble
+@preamble(section="vtk")
 def define_subsamplinglevel(name):
     ini = name_initree()
     degree = get_trial_element().degree()
     if isinstance(degree, tuple):
         degree = max(degree)
-    if get_option("blockstructured"):
-        degree *= get_option("number_of_blocks")
+    if get_form_option("blockstructured"):
+        degree *= get_form_option("number_of_blocks")
     return "Dune::RefinementIntervals {}({}.get<int>(\"vtk.subsamplinglevel\", {}));".format(name, ini, max(degree, 1))
 
 
@@ -55,7 +55,7 @@ def name_subsamplingintervals():
     return "subint"
 
 
-@preamble
+@preamble(section="vtk")
 def define_vtkwriter(name):
     _type = type_vtkwriter()
     gv = name_leafview()
@@ -68,14 +68,14 @@ def name_vtkwriter():
     return "vtkwriter"
 
 
-@preamble
+@preamble(section="vtk")
 def vtkoutput():
     include_file("dune/pdelab/gridfunctionspace/vtk.hh", filetag="driver")
     vtkwriter = name_vtkwriter()
     gfs = name_trial_gfs()
     vtkfile = name_vtkfile()
     predicate = name_predicate()
-    vec = name_vector(get_formdata())
+    vec = name_vector(get_form_ident())
 
     return ["Dune::PDELab::addSolutionToVTKWriter({}, {}, {}, Dune::PDELab::vtk::defaultNameScheme(), {});".format(vtkwriter, gfs, vec, predicate),
             "{}.write({}, Dune::VTK::ascii);".format(vtkwriter, vtkfile)]
@@ -86,7 +86,7 @@ def type_predicate():
     return "CuttingPredicate"
 
 
-@preamble
+@preamble(section="vtk")
 def define_predicate(name):
     t = type_predicate()
     return "{} {};".format(t, name)
@@ -97,7 +97,7 @@ def name_predicate():
     return "predicate"
 
 
-@preamble
+@preamble(section="vtk")
 def typedef_vtk_sequence_writer(name):
     include_file("dune/grid/io/file/vtk/vtksequencewriter.hh", filetag="driver")
     gv_type = type_leafview()
@@ -109,7 +109,7 @@ def type_vtk_sequence_writer():
     return "VTKSW"
 
 
-@preamble
+@preamble(section="vtk")
 def define_vtk_sequence_writer(name):
     vtksw_type = type_vtk_sequence_writer()
     vtkw_type = type_vtkwriter()
@@ -123,13 +123,13 @@ def name_vtk_sequence_writer():
     return "vtkSequenceWriter"
 
 
-@preamble
+@preamble(section="vtk")
 def visualize_initial_condition():
     include_file("dune/pdelab/gridfunctionspace/vtk.hh", filetag="driver")
     vtkwriter = name_vtk_sequence_writer()
     element = get_trial_element()
     gfs = name_trial_gfs()
-    vector = name_vector(get_formdata())
+    vector = name_vector(get_form_ident())
     predicate = name_predicate()
     from dune.perftool.pdelab.driver.instationary import name_time
     time = name_time()
diff --git a/python/dune/perftool/pdelab/function.py b/python/dune/perftool/pdelab/function.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1dadef18fdcb8a60a88d06aff5fac186a80432d
--- /dev/null
+++ b/python/dune/perftool/pdelab/function.py
@@ -0,0 +1,52 @@
+from dune.perftool.generation import (get_backend,
+                                      instruction,
+                                      kernel_cached,
+                                      preamble,
+                                      temporary_variable,
+                                      )
+from dune.perftool.pdelab.geometry import (name_cell,
+                                           world_dimension,
+                                           )
+from dune.perftool.pdelab.localoperator import name_gridfunction_member
+
+import pymbolic.primitives as prim
+
+
+@preamble
+def bind_gridfunction_to_element(gf, restriction):
+    element = name_cell(restriction)
+    return "{}.bind({});".format(gf, element)
+
+
+def declare_grid_function_range(gridfunction):
+    def _decl(name, kernel, decl_info):
+        return "typename decltype({})::Range {};".format(gridfunction, name)
+
+    return _decl
+
+
+@kernel_cached
+def pymbolic_evaluate_gridfunction(name, coeff, restriction, grad):
+    diffOrder = 1 if grad else 0
+
+    gridfunction = name_gridfunction_member(coeff, restriction, diffOrder)
+    bind_gridfunction_to_element(gridfunction, restriction)
+
+    temporary_variable(name,
+                       shape=(1,) + (world_dimension(),) * diffOrder,
+                       decl_method=declare_grid_function_range(gridfunction),
+                       managed=False,
+                       )
+
+    quadpos = get_backend(interface="qp_in_cell")(restriction)
+    instruction(code="{} = {}({});".format(name, gridfunction, quadpos),
+                assignees=frozenset({name}),
+                within_inames=frozenset(get_backend(interface="quad_inames")()),
+                within_inames_is_final=True,
+                )
+
+
+def pymbolic_gridfunction(coeff, restriction, grad):
+    name = "coeff{}{}".format(coeff.count(), "_grad" if grad else "")
+    pymbolic_evaluate_gridfunction(name, coeff, restriction, grad)
+    return prim.Subscript(prim.Variable(name), (0,))
diff --git a/python/dune/perftool/pdelab/geometry.py b/python/dune/perftool/pdelab/geometry.py
index 0bf2f9143134db9e7335943f750fccb8cf4e5819..d308917adbd3391d4b61832938b3a1c4f3355eb5 100644
--- a/python/dune/perftool/pdelab/geometry.py
+++ b/python/dune/perftool/pdelab/geometry.py
@@ -13,7 +13,7 @@ from dune.perftool.generation import (backend,
                                       temporary_variable,
                                       valuearg,
                                       )
-from dune.perftool.options import (get_option,
+from dune.perftool.options import (get_form_option,
                                    option_switch,
                                    )
 from dune.perftool.loopy.target import dtype_floatingpoint, type_floatingpoint
@@ -60,8 +60,7 @@ def _component_iname(context, count):
     if context:
         context = '_' + context
     name = 'idim{}{}'.format(context, str(count))
-    formdata = get_global_context_value('formdata')
-    dim = formdata.geometric_dimension
+    dim = world_dimension()
     domain(name, dim)
     return name
 
@@ -113,7 +112,7 @@ def type_geometry_wrapper():
 @preamble
 def define_restricted_cell(name, restriction):
     ig = name_intersection_geometry_wrapper()
-    which = "inside" if restriction == Restriction.NEGATIVE else "outside"
+    which = "inside" if restriction == Restriction.POSITIVE else "outside"
     return "const auto& {} = {}.{}();".format(name,
                                               ig,
                                               which,
@@ -125,7 +124,7 @@ def name_cell(restriction):
         eg = name_element_geometry_wrapper()
         return "{}.entity()".format(eg)
     else:
-        which = "inside" if restriction == Restriction.NEGATIVE else "outside"
+        which = "inside" if restriction == Restriction.POSITIVE else "outside"
         name = "{}_cell".format(which)
         define_restricted_cell(name, restriction)
         return name
@@ -187,7 +186,7 @@ def name_geometry():
 @preamble
 def define_in_cell_geometry(restriction, name):
     ig = name_intersection_geometry_wrapper()
-    which = "In" if restriction == Restriction.NEGATIVE else "Out"
+    which = "In" if restriction == Restriction.POSITIVE else "Out"
     return "auto {} = {}.geometryIn{}side();".format(name,
                                                      ig,
                                                      which
@@ -197,7 +196,7 @@ def define_in_cell_geometry(restriction, name):
 def name_in_cell_geometry(restriction):
     assert restriction is not Restriction.NONE
 
-    name = "geo_in_{}side".format("in" if restriction is Restriction.NEGATIVE else "out")
+    name = "geo_in_{}side".format("in" if restriction is Restriction.POSITIVE else "out")
     define_in_cell_geometry(restriction, name)
     return name
 
@@ -217,7 +216,7 @@ def apply_in_cell_transformation(name, local, restriction):
 
 def pymbolic_in_cell_coordinates(local, restriction):
     basename = get_pymbolic_basename(local)
-    name = "{}_in_{}side".format(basename, "in" if restriction is Restriction.NEGATIVE else "out")
+    name = "{}_in_{}side".format(basename, "in" if restriction is Restriction.POSITIVE else "out")
     temporary_variable(name, shape=(world_dimension(),), shape_impl=("fv",))
     apply_in_cell_transformation(name, local, restriction)
     return Variable(name)
@@ -232,8 +231,11 @@ def to_cell_coordinates(local, restriction):
 
 
 def world_dimension():
-    formdata = get_global_context_value('formdata')
-    return formdata.geometric_dimension
+    data = get_global_context_value("data")
+    form = data.object_by_name[get_form_option("form")]
+    from dune.perftool.ufl.preprocess import preprocess_form
+    form = preprocess_form(form).preprocessed_form
+    return form.ufl_cell().geometric_dimension()
 
 
 def intersection_dimension():
@@ -259,14 +261,14 @@ def evaluate_unit_outer_normal(name):
 
 
 @preamble
-def declare_normal(name, shape, shape_impl):
+def declare_normal(name, kernel, decl_info):
     ig = name_intersection_geometry_wrapper()
     return "auto {} = {}.centerUnitOuterNormal();".format(name, ig)
 
 
 def pymbolic_unit_outer_normal():
     name = "outer_normal"
-    if not get_option("diagonal_transformation_matrix"):
+    if not get_form_option("diagonal_transformation_matrix"):
         temporary_variable(name, shape=(world_dimension(),), decl_method=declare_normal)
         evaluate_unit_outer_normal(name)
     else:
@@ -291,19 +293,14 @@ def pymbolic_unit_inner_normal():
 
 
 def type_jacobian_inverse_transposed(restriction):
-    if get_option('turn_off_diagonal_jacobian'):
-        dim = world_dimension()
-        ftype = type_floatingpoint()
-        return "typename Dune::FieldMatrix<{},{},{}>".format(ftype, dim, dim)
-    else:
-        geo = type_cell_geometry(restriction)
-        return "typename {}::JacobianInverseTransposed".format(geo)
+    geo = type_cell_geometry(restriction)
+    return "typename {}::JacobianInverseTransposed".format(geo)
 
 
 @kernel_cached
 def define_jacobian_inverse_transposed_temporary(restriction):
     @preamble
-    def _define_jacobian_inverse_transposed_temporary(name, shape, shape_impl):
+    def _define_jacobian_inverse_transposed_temporary(name, kernel, decl_info):
         t = type_jacobian_inverse_transposed(restriction)
         return "{} {};".format(t,
                                name,
@@ -475,7 +472,7 @@ def define_cell_volume(name, restriction):
 
 
 def pymbolic_cell_volume(restriction):
-    if get_option("constant_transformation_matrix"):
+    if get_form_option("constant_transformation_matrix"):
         return pymbolic_jacobian_determinant()
     else:
         name = restricted_name("volume", restriction)
@@ -491,7 +488,7 @@ def define_facet_area(name):
 
 
 def pymbolic_facet_area():
-    if get_option("constant_transformation_matrix"):
+    if get_form_option("constant_transformation_matrix"):
         return pymbolic_facet_jacobian_determinant()
     else:
         name = "area"
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 38a41bb06a70d0a08b47dc3144ffba412bec01d4..06c404ea861fcf5be8cb05c7b24bb0f6938b5fb7 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -3,7 +3,10 @@ from os.path import splitext
 
 import logging
 
-from dune.perftool.options import (get_option,
+import numpy as np
+
+from dune.perftool.options import (get_form_option,
+                                   get_option,
                                    option_switch)
 from dune.perftool.generation import (backend,
                                       base_class,
@@ -13,6 +16,7 @@ from dune.perftool.generation import (backend,
                                       domain,
                                       dump_accumulate_timer,
                                       end_of_file,
+                                      function_mangler,
                                       generator_factory,
                                       get_backend,
                                       get_global_context_value,
@@ -42,31 +46,6 @@ import loopy as lp
 import cgen
 
 
-def name_form(formdata, data):
-    # Check wether the formdata has a name in UFL
-    try:
-        name = data.object_names[id(formdata.original_form)]
-        return name
-    except:
-        for index, form in enumerate(data.forms):
-            if formdata.preprocessed_form.equals(form):
-                name = str(index)
-                return name
-    # If the form has no name and can not be found in data.forms something went wrong
-    assert False
-
-
-def name_localoperator_file(formdata, data):
-    from dune.perftool.options import get_option
-    if len(data.forms) == 1:
-        filename = get_option("operator_file")
-    else:
-        suffix = '_' + name_form(formdata, data)
-        basename, extension = splitext(get_option("operator_file"))
-        filename = basename + suffix + extension
-    return filename
-
-
 @template_parameter(classtag="operator")
 def lop_template_ansatz_gfs():
     name = "GFSU"
@@ -176,9 +155,45 @@ def name_initree_member():
 
 
 @class_basename(classtag="operator")
-def localoperator_basename(formdata, data):
-    form_name = name_form(formdata, data)
-    return "LocalOperator" + form_name.capitalize()
+def localoperator_basename(form_ident):
+    return get_form_option("classname", form_ident)
+
+
+def name_gridfunction_member(coeff, restriction, diffOrder=0):
+    # We reuse the grid function for volume integrals in skeleton integrals
+    if restriction == Restriction.POSITIVE:
+        restriction = Restriction.NONE
+    restr = "_n" if restriction == Restriction.NEGATIVE else ""
+    name = "local_gridfunction_coeff{}_diff{}{}".format(coeff.count(), diffOrder, restr)
+    define_gridfunction_member(name, coeff, restriction, diffOrder)
+    return name
+
+
+def name_gridfunction_constructor_argument(coeff):
+    _type = type_gridfunction_template_parameter(coeff)
+    name = "gridfunction_coeff{}_".format(coeff.count())
+    constructor_parameter("const {}&".format(_type), name, classtag="operator")
+    return name
+
+
+@class_member(classtag="operator")
+def define_gridfunction_member(name, coeff, restriction, diffOrder):
+    _type = type_gridfunction_template_parameter(coeff)
+    param = name_gridfunction_constructor_argument(coeff)
+    if diffOrder > 0:
+        other = name_gridfunction_member(coeff, restriction, diffOrder - 1)
+        init = "derivative({})".format(other)
+        initializer_list(name, [init], classtag="operator")
+        return "mutable decltype({}) {};".format(init, name)
+    else:
+        init = "localFunction({})".format(param)
+        initializer_list(name, [init], classtag="operator")
+        return "mutable typename {}::LocalFunction {};".format(_type, name)
+
+
+@template_parameter(classtag="operator")
+def type_gridfunction_template_parameter(coeff):
+    return "GRIDFUNCTION_COEFF{}".format(coeff.count())
 
 
 def class_type_from_cache(classtag):
@@ -244,7 +259,7 @@ def determine_accumulation_space(info, number):
     from loopy.types import NumpyType
     valuearg(lfs, dtype=NumpyType("str"))
 
-    if get_option("blockstructured"):
+    if get_form_option("blockstructured"):
         from dune.perftool.blockstructured.tools import micro_index_to_macro_index
         from dune.perftool.blockstructured.spaces import lfs_inames
         lfsi = micro_index_to_macro_index(subel, lfs_inames(subel, info.restriction, count=number))
@@ -276,7 +291,8 @@ def boundary_predicates(expr, measure, subdomain_id):
 
         # Get the original form and inspect the present measures
         from dune.perftool.generation import get_global_context_value
-        original_form = get_global_context_value("formdata").original_form
+        data = get_global_context_value("data")
+        original_form = data.object_by_name[get_form_option("form")]
 
         sd = original_form.subdomain_data()
         assert len(sd) == 1
@@ -295,16 +311,7 @@ def boundary_predicates(expr, measure, subdomain_id):
             visitor = get_visitor(measure, subdomain_id)
             cond = visitor(subdomain_data, do_predicates=True)
         else:
-            # Determine the name of the parameter function
-            cond = get_global_context_value("data").object_names[id(subdomain_data)]
-
-            # Trigger the generation of code for this thing in the parameter class
-            from ufl.checks import is_cellwise_constant
-            cellwise_constant = is_cellwise_constant(expr)
-            from dune.perftool.pdelab.parameter import intersection_parameter_function
-            intersection_parameter_function(cond, subdomain_data, cellwise_constant, t='int32')
-
-            cond = prim.Variable(cond)
+            raise NotImplementedError("Only UFL expressions allowed in subdomain_data right now.")
 
         predicates = predicates.union([prim.Comparison(cond, '==', subdomain_id)])
 
@@ -341,13 +348,12 @@ def _list_infos(expr, number, visitor):
         return
     element = ma[0].argexpr.ufl_element()
 
-    from dune.perftool.ufl.modified_terminals import Restriction
     if visitor.measure == "cell":
         restrictions = (Restriction.NONE,)
     elif visitor.measure == "exterior_facet":
-        restrictions = (Restriction.NEGATIVE,)
+        restrictions = (Restriction.POSITIVE,)
     elif visitor.measure == "interior_facet":
-        restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE)
+        restrictions = (Restriction.POSITIVE, Restriction.NEGATIVE)
     for res in restrictions:
         for ei in range(element.value_size()):
             yield PDELabAccumulationInfo(element_index=ei, restriction=res)
@@ -372,8 +378,7 @@ def get_accumulation_info(expr, visitor):
 
     restriction = visitor.restriction
     if visitor.measure == 'exterior_facet':
-        from dune.perftool.pdelab.restriction import Restriction
-        restriction = Restriction.NEGATIVE
+        restriction = Restriction.POSITIVE
 
     inames = visitor.interface.lfs_inames(leaf_element,
                                           restriction,
@@ -425,10 +430,10 @@ def generate_accumulation_instruction(expr, visitor):
 
 def get_visitor(measure, subdomain_id):
     # Get a transformer instance for this kernel
-    if get_option('sumfact'):
+    if get_form_option('sumfact'):
         from dune.perftool.sumfact import SumFactInterface
         interface = SumFactInterface()
-    elif get_option('blockstructured'):
+    elif get_form_option('blockstructured'):
         from dune.perftool.blockstructured import BlockStructuredInterface
         interface = BlockStructuredInterface()
     else:
@@ -469,7 +474,11 @@ def generate_kernel(integrals):
     delete_cache_items("kernel_default")
     for integral in integrals:
         visit_integral(integral)
-    knl = extract_kernel_from_cache("kernel_default")
+
+    from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature
+    name = kernel_name()
+    signature = assembly_routine_signature()
+    knl = extract_kernel_from_cache("kernel_default", name, signature)
     delete_cache_items("kernel_default")
 
     # Reset the quadrature degree
@@ -487,7 +496,7 @@ def generate_kernels_per_integral(integrals):
     yield generate_kernel(integrals)
 
 
-def extract_kernel_from_cache(tag, wrap_in_cgen=True):
+def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timings=True):
     # Now extract regular loopy kernel components
     from dune.perftool.loopy.target import DuneTarget
     domains = [i for i in retrieve_cache_items("{} and domain".format(tag))]
@@ -506,15 +515,9 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
     from loopy import Options
     opt = Options(ignore_boostable_into=True,
                   check_dep_resolution=False,
+                  enforce_variable_access_ordered="no_check",
                   )
 
-    # Find a name for the kernel
-    if wrap_in_cgen:
-        from dune.perftool.pdelab.signatures import kernel_name
-        name = kernel_name()
-    else:
-        name = "constructor_kernel"
-
     # Create the kernel
     from loopy import make_kernel, preprocess_kernel
     kernel = make_kernel(domains,
@@ -525,35 +528,21 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
                          options=opt,
                          silenced_warnings=silenced,
                          name=name,
+                         lang_version=(2017, 2, 1),
                          )
     from loopy import make_reduction_inames_unique
     kernel = make_reduction_inames_unique(kernel)
 
-    from dune.perftool.loopy.transformations.disjointgroups import make_groups_conflicting
-    kernel = make_groups_conflicting(kernel)
-
     # Apply the transformations that were gathered during tree traversals
     for trafo in transformations:
-        kernel = trafo[0](kernel, *trafo[1])
-
-    # Precompute all the substrules
-    for sr in kernel.substitutions:
-        tmpname = "precompute_{}".format(sr)
-        kernel = lp.precompute(kernel,
-                               sr,
-                               temporary_name=tmpname,
-                               )
-        # Vectorization strategies are actually very likely to eliminate the
-        # precomputation temporary. To avoid the temporary elimination warning
-        # we need to explicitly disable it.
-        kernel = kernel.copy(silenced_warnings=kernel.silenced_warnings + ["temp_to_write({})".format(tmpname)])
+        kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
     from dune.perftool.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
     # Maybe apply vectorization strategies
-    if get_option("vectorization_quadloop"):
-        if get_option("sumfact"):
+    if get_form_option("vectorization_quadloop"):
+        if get_form_option("sumfact"):
             from dune.perftool.loopy.transformations.vectorize_quad import vectorize_quadrature_loop
             kernel = vectorize_quadrature_loop(kernel)
         else:
@@ -578,11 +567,20 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
     from dune.perftool.loopy.transformations.matchfma import match_fused_multiply_add
     kernel = match_fused_multiply_add(kernel)
 
+    # Add instrumentation to the kernel
+    from dune.perftool.loopy.transformations.instrumentation import add_instrumentation
+    if add_timings and get_form_option("sumfact"):
+        from dune.perftool.pdelab.signatures import assembler_routine_name
+        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage1"), "{}_kernel_stage1".format(assembler_routine_name()), 4)
+        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4)
+        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4)
+
     if wrap_in_cgen:
         # Wrap the kernel in something which can generate code
-        from dune.perftool.pdelab.signatures import assembly_routine_signature
-        signature = assembly_routine_signature()
-        kernel = LoopyKernelMethod(signature, kernel)
+        if signature is None:
+            from dune.perftool.pdelab.signatures import assembly_routine_signature
+            signature = assembly_routine_signature()
+        kernel = LoopyKernelMethod(signature, kernel, add_timings=add_timings)
 
     return kernel
 
@@ -663,12 +661,16 @@ class LoopyKernelMethod(ClassMember):
                 content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
 
         content.append('}')
-        ClassMember.__init__(self, content)
+        ClassMember.__init__(self, content, name=kernel.name if kernel is not None else "")
 
 
 def cgen_class_from_cache(tag, members=[]):
     from dune.perftool.generation import retrieve_cache_items
 
+    # Sort the given member functions by their name to help debugging by fixing
+    # the order
+    members = sorted(members, key=lambda m: m.name)
+
     # Generate the name by concatenating basename and template parameters
     basename, fullname = class_type_from_cache(tag)
 
@@ -679,12 +681,15 @@ def cgen_class_from_cache(tag, members=[]):
     tparams = [i for i in retrieve_cache_items('{} and template_param'.format(tag))]
 
     # Construct the constructor
-    constructor_knl = extract_kernel_from_cache(tag, wrap_in_cgen=False)
+    constructor_knl = extract_kernel_from_cache(tag, "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
     from dune.perftool.loopy.target import DuneTarget
     constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
     signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params))
     constructor = LoopyKernelMethod([signature], constructor_knl, add_timings=False, initializer_list=il)
 
+    from loopy import get_one_scheduled_kernel
+    constructor_knl = get_one_scheduled_kernel(constructor_knl)
+
     # Take any temporary declarations from the kernel and make them class members
     target = DuneTarget()
     from loopy.codegen import CodeGenerationState
@@ -705,16 +710,7 @@ def cgen_class_from_cache(tag, members=[]):
     return Class(basename, base_classes=base_classes, members=[constructor] + members + pm + decls, tparam_decls=tparams)
 
 
-def generate_localoperator_kernels(formdata, data):
-    logger = logging.getLogger(__name__)
-
-    # Extract the relevant attributes of the form data
-    form = formdata.preprocessed_form
-
-    # Reset the generation cache
-    from dune.perftool.generation import delete_cache_items
-    delete_cache_items()
-
+def local_operator_default_settings(operator, form):
     # Manage includes and base classes that we always need
     include_file('dune/pdelab/gridfunctionspace/gridfunctionspace.hh', filetag="operatorfile")
     include_file('dune/pdelab/localoperator/idefault.hh', filetag="operatorfile")
@@ -729,18 +725,26 @@ def generate_localoperator_kernels(formdata, data):
 
     # Trigger this one once early on to assure that template
     # parameters are set in the right order
-    localoperator_basename(formdata, data)
+    localoperator_basename(operator)
     lop_template_ansatz_gfs()
     lop_template_test_gfs()
     lop_template_range_field()
-    from dune.perftool.pdelab.parameter import parameterclass_basename
-    parameterclass_basename(formdata, data)
 
-    # Make sure there is always the same constructor arguments (even if parameter class is empty)
-    from dune.perftool.pdelab.localoperator import name_initree_member
+    # Make sure there are always the same constructor arguments, even if some of them are
+    # not strictly needed. Also ensure the order.
     name_initree_member()
-    from dune.perftool.pdelab.parameter import name_paramclass
-    name_paramclass()
+
+    # Iterate over the needed grid functions in correct order
+    for c in sorted(filter(lambda c: c.count() > 2, form.coefficients()), key=lambda c: c.count()):
+        name_gridfunction_constructor_argument(c)
+
+    # Set some options!
+    from dune.perftool.pdelab.driver import isQuadrilateral
+    if isQuadrilateral(form.arguments()[0].ufl_element().cell()):
+        from dune.perftool.options import set_form_option
+        # For Yasp Grids the jacobian of the transformation is diagonal and constant on each cell
+        set_form_option('diagonal_transformation_matrix', True)
+        set_form_option('constant_transformation_matrix', True)
 
     # Add right base classes for stationary/instationary operators
     base_class('Dune::PDELab::LocalOperatorDefaultFlags', classtag="operator")
@@ -750,18 +754,18 @@ def generate_localoperator_kernels(formdata, data):
         base_class('Dune::PDELab::InstationaryLocalOperatorDefaultMethods<{}>'
                    .format(rf), classtag="operator")
 
-        # Create set time method in parameter class
-        from dune.perftool.pdelab.parameter import define_set_time_method
-        define_set_time_method()
 
-    # Have a data structure collect the generated kernels
-    operator_kernels = {}
+def generate_residual_kernels(form, original_form):
+    if not get_form_option("generate_residuals"):
+        return {}
 
-    logger.info("generate_localoperator_kernels: create residual methods")
+    logger = logging.getLogger(__name__)
     with global_context(form_type='residual'):
+        operator_kernels = {}
+
         # Generate the necessary residual methods
         for measure in set(i.integral_type() for i in form.integrals()):
-            logger.info("generate_localoperator_kernels: measure {}".format(measure))
+            logger.info("generate_residual_kernels: measure {}".format(measure))
             with global_context(integral_type=measure):
                 enum_pattern()
                 pattern_baseclass()
@@ -772,7 +776,7 @@ def generate_localoperator_kernels(formdata, data):
                     kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(form.integrals_by_type(measure))]
 
                 # Maybe add numerical differentiation
-                if get_option("numerical_jacobian"):
+                if get_form_option("numerical_jacobian"):
                     # Include headers for numerical methods
                     include_file("dune/pdelab/localoperator/defaultimp.hh", filetag="operatorfile")
 
@@ -791,9 +795,9 @@ def generate_localoperator_kernels(formdata, data):
                                      )
 
                     # In the case of matrix free operator evaluation we need jacobian apply methods
-                    if get_option("matrix_free"):
+                    if get_form_option("matrix_free"):
                         from dune.perftool.pdelab.driver import is_linear
-                        if is_linear(formdata.original_form):
+                        if is_linear(original_form):
                             # Numeical jacobian apply base class
                             base_class("Dune::PDELab::NumericalJacobianApply{}<{}>".format(which, loptype), classtag="operator")
 
@@ -812,21 +816,34 @@ def generate_localoperator_kernels(formdata, data):
                                              classtag="operator",
                                              )
 
-                operator_kernels[(measure, 'residual')] = kernel
+            operator_kernels[(measure, 'residual')] = kernel
+
+        return operator_kernels
+
+
+def generate_jacobian_kernels(form, original_form):
+    logger = logging.getLogger(__name__)
+
+    from ufl import derivative
+    jacform = derivative(original_form, original_form.coefficients()[0])
 
-    # Generate the necessary jacobian methods
-    if not get_option("numerical_jacobian"):
-        logger.info("generate_localoperator_kernels: create jacobian methods")
-        from ufl import derivative
-        jacform = derivative(formdata.original_form, formdata.original_form.coefficients()[0])
+    from dune.perftool.ufl.preprocess import preprocess_form
+    jacform = preprocess_form(jacform).preprocessed_form
 
-        from dune.perftool.ufl.preprocess import preprocess_form
-        jacform = preprocess_form(jacform).preprocessed_form
+    if get_form_option("block_preconditioner_diagonal"):
+        from dune.perftool.ufl.transformations.blockpreconditioner import diagonal_block_jacobian
+        jacform = diagonal_block_jacobian(jacform)
+    if get_form_option("block_preconditioner_offdiagonal"):
+        from dune.perftool.ufl.transformations.blockpreconditioner import offdiagonal_block_jacobian
+        jacform = offdiagonal_block_jacobian(jacform)
 
-        with global_context(form_type="jacobian"):
+    operator_kernels = {}
+    with global_context(form_type="jacobian"):
+        if get_form_option("generate_jacobians"):
             for measure in set(i.integral_type() for i in jacform.integrals()):
-                logger.info("generate_localoperator_kernels: measure {}".format(measure))
+                logger.info("generate_jacobian_kernels: measure {}".format(measure))
                 with global_context(integral_type=measure):
+                    from dune.perftool.pdelab.signatures import assembler_routine_name
                     with global_context(kernel=assembler_routine_name()):
                         kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(jacform.integrals_by_type(measure))]
                 operator_kernels[(measure, 'jacobian')] = kernel
@@ -841,43 +858,207 @@ def generate_localoperator_kernels(formdata, data):
                     from dune.perftool.pdelab.signatures import assembly_routine_signature
                     operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
 
-        # Jacobian apply methods for matrix-free computations
-        if get_option("matrix_free"):
-            # The apply vector has reserved index 1 so we directly use Coefficient class from ufl
-            from ufl import Coefficient
-            apply_coefficient = Coefficient(form.coefficients()[0].ufl_element(), 1)
-
-            # Create application of jacobian on vector
-            from ufl import action
-            jac_apply_form = action(jacform, apply_coefficient)
-
-            # Create kernel for jacobian application
-            with global_context(form_type="jacobian_apply"):
-                for measure in set(i.integral_type() for i in jac_apply_form.integrals()):
-                    with global_context(integral_type=measure):
-                        with global_context(kernel=assembler_routine_name()):
-                            kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(jac_apply_form.integrals_by_type(measure))]
-                    operator_kernels[(measure, 'jacobian_apply')] = kernel
-
-                    # Generate dummy functions for those kernels, that vanished in the differentiation process
-                    # We *could* solve this problem by using lambda_* terms but we do not really want that, so
-                    # we use empty jacobian assembly methods instead
-                    alpha_measures = set(i.integral_type() for i in form.integrals())
-                    jacobian_apply_measures = set(i.integral_type() for i in jac_apply_form.integrals())
-                    for it in alpha_measures - jacobian_apply_measures:
-                        with global_context(integral_type=it):
-                            from dune.perftool.pdelab.signatures import assembly_routine_signature
-                            operator_kernels[(it, 'jacobian_apply')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
+    # Jacobian apply methods for matrix-free computations
+    if get_form_option("matrix_free"):
+        # The apply vector has reserved index 1 so we directly use Coefficient class from ufl
+        from ufl import Coefficient
+        apply_coefficient = Coefficient(form.coefficients()[0].ufl_element(), 1)
+
+        # Create application of jacobian on vector
+        from ufl import action
+        jac_apply_form = action(jacform, apply_coefficient)
+
+        # Create kernel for jacobian application
+        with global_context(form_type="jacobian_apply"):
+            for measure in set(i.integral_type() for i in jac_apply_form.integrals()):
+                with global_context(integral_type=measure):
+                    from dune.perftool.pdelab.signatures import assembler_routine_name
+                    with global_context(kernel=assembler_routine_name()):
+                        kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(jac_apply_form.integrals_by_type(measure))]
+                operator_kernels[(measure, 'jacobian_apply')] = kernel
+
+                # Generate dummy functions for those kernels, that vanished in the differentiation process
+                # We *could* solve this problem by using lambda_* terms but we do not really want that, so
+                # we use empty jacobian assembly methods instead
+                alpha_measures = set(i.integral_type() for i in form.integrals())
+                jacobian_apply_measures = set(i.integral_type() for i in jac_apply_form.integrals())
+                for it in alpha_measures - jacobian_apply_measures:
+                    with global_context(integral_type=it):
+                        from dune.perftool.pdelab.signatures import assembly_routine_signature
+                        operator_kernels[(it, 'jacobian_apply')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
+
+    return operator_kernels
+
+
+def generate_control_kernels(forms):
+    # All forms will be written in the residual method and
+    # accumulation will be done in a class member instead of the
+    # residual.
+    logger = logging.getLogger(__name__)
+    with global_context(form_type='residual'):
+        operator_kernels = {}
+
+        # Generate the necessary residual methods
+        for measure in set(i.integral_type() for form in forms for i in form.integrals()):
+            logger.info("generate_control_kernels: measure {}".format(measure))
+            with global_context(integral_type=measure):
+                enum_pattern()
+                pattern_baseclass()
+                enum_alpha()
+
+                from dune.perftool.pdelab.signatures import assembler_routine_name
+                with global_context(kernel=assembler_routine_name()):
+                    # TODO: Sumfactorization not yet implemented
+                    assert not get_form_option('sumfact')
+
+                    from dune.perftool.pdelab.adjoint import control_generate_kernels_per_integral
+                    forms_measure = [form.integrals_by_type(measure) for form in forms]
+                    kernel = [k for k in control_generate_kernels_per_integral(forms_measure)]
+
+            operator_kernels[(measure, 'residual')] = kernel
+
+        return operator_kernels
+
+
+def generate_localoperator_kernels(operator):
+    logger = logging.getLogger(__name__)
+
+    data = get_global_context_value("data")
+    original_form = data.object_by_name[get_form_option("form")]
+    from dune.perftool.ufl.preprocess import preprocess_form
+
+    if get_form_option("adjoint"):
+        # Generate adjoint operator
+        #
+        # The jacobian of the adjoint form is just the jacobian of the
+        # original form with test and ansatz function swapped. As a
+        # linear form you have to subtract the derivative of the
+        # objective function w.r.t the ansatz function to get the
+        # final residual formulation of the adjoint.
+        #
+        # Might not be true in all cases but works for the simple ones.
+        assert get_form_option("objective_function") is not None
+        assert get_form_option("control") is False
+
+        from ufl import derivative, adjoint, action, replace
+        from ufl.classes import Coefficient
+
+        # Jacobian of the adjoint form
+        jacform = derivative(original_form, original_form.coefficients()[0])
+        adjoint_jacform = adjoint(jacform)
+
+        # Derivative of objective function w.r.t. state
+        objective = data.object_by_name[get_form_option("objective_function")]
+        objective_jacobian = derivative(objective, objective.coefficients()[0])
+
+        # Replace coefficient belonging to ansatz function with new coefficient
+        element = objective.coefficients()[0].ufl_element()
+        coeff = Coefficient(element, count=3)
+        objective_jacobian = replace(objective_jacobian, {objective.coefficients()[0]: coeff})
+        if len(adjoint_jacform.coefficients()) > 0:
+            adjoint_jacform = replace(adjoint_jacform, {adjoint_jacform.coefficients()[0]: coeff})
+
+        # Residual of the adjoint form
+        adjoint_form = action(adjoint_jacform, original_form.coefficients()[0])
+        adjoint_form = adjoint_form + objective_jacobian
+
+        # Update form and original_form
+        original_form = adjoint_form
+        form = preprocess_form(adjoint_form).preprocessed_form
+
+    elif get_form_option("control"):
+        # Generate control operator
+        #
+        # This is the normal form derived w.r.t. the control
+        # variable. We generate a form for every row of:
+        #
+        # \nabla  \hat{J}(m) = (\nabla R(z,m))^T \lambda + \nabla_m J(z,m)
+        #
+        # These forms will not depend on the test function anymore and
+        # will need special treatment for the accumulation process.
+        from ufl import action, diff
+        from ufl.classes import Coefficient
+
+        # Get control variables
+        assert get_form_option("control_variable") is not None
+        controls = [data.object_by_name[ctrl.strip()] for ctrl in get_form_option("control_variable").split(",")]
+
+        # Transform flat index to multiindex. Wrapper around numpy
+        # unravel since we need to transform numpy ints to native
+        # ints.
+        def _unravel(flat_index, shape):
+            multi_index = np.unravel_index(flat_index, shape)
+            multi_index = tuple(int(i) for i in multi_index)
+            return multi_index
+
+        # Will be used to replace ansatz function with adjoint function
+        element = original_form.coefficients()[0].ufl_element()
+        coeff = Coefficient(element, count=3)
+
+        # Store a form for every control
+        forms = []
+        for control in controls:
+            shape = control.ufl_shape
+            flat_length = int(np.prod(shape))
+            for i in range(flat_length):
+                c = control[_unravel(i, shape)]
+                control_form = diff(original_form, c)
+                control_form = action(control_form, coeff)
+                objective = data.object_by_name[get_form_option("objective_function")]
+                objective_gradient = diff(objective, c)
+                control_form = control_form + objective_gradient
+                forms.append(preprocess_form(control_form).preprocessed_form)
+
+        # Used to create local operator default settings
+        form = preprocess_form(original_form).preprocessed_form
+
+    else:
+        form = preprocess_form(original_form).preprocessed_form
+
+    # Reset the generation cache
+    from dune.perftool.generation import delete_cache_items
+    delete_cache_items()
+
+    # Have a data structure collect the generated kernels
+    operator_kernels = {}
+
+    # Generate things needed for all local operator files
+    local_operator_default_settings(operator, form)
+
+    if get_form_option("control"):
+        logger.info("generate_localoperator_kernels: create methods for control operator")
+        operator_kernels.update(generate_control_kernels(forms))
+    else:
+        logger.info("generate_localoperator_kernels: create residual methods")
+        operator_kernels.update(generate_residual_kernels(form, original_form))
+
+        # Generate the necessary jacobian methods
+        if not get_form_option("numerical_jacobian"):
+            logger.info("generate_localoperator_kernels: create jacobian methods")
+            operator_kernels.update(generate_jacobian_kernels(form, original_form))
 
     # Return the set of generated kernels
     return operator_kernels
 
 
-def generate_localoperator_file(formdata, kernels, filename):
+def generate_localoperator_file(kernels, filename):
+    logger = logging.getLogger(__name__)
+
     operator_methods = []
     for k in kernels.values():
         operator_methods.extend(k)
 
+    # Generate all the realizations of sum factorization kernel objects needed in this operator
+    sfkernels = [sf for sf in retrieve_cache_items("kernelimpl")]
+    if sfkernels:
+        logger.info("generate_localoperator_kernels: Create {} sumfact kernel realizations".format(len(sfkernels)))
+
+    from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
+    for sf, qp in sfkernels:
+        from dune.perftool.sumfact.tabulation import set_quadrature_points
+        set_quadrature_points(qp)
+        operator_methods.append(realize_sumfact_kernel_function(sf))
+
     if get_option('instrumentation_level') >= 3:
         include_file('dune/perftool/common/timer.hh', filetag='operatorfile')
         operator_methods.append(TimerMethod())
@@ -886,17 +1067,6 @@ def generate_localoperator_file(formdata, kernels, filename):
 
     # Write the file!
     from dune.perftool.file import generate_file
-    param = cgen_class_from_cache("parameterclass")
     # TODO take the name of this thing from the UFL file
     lop = cgen_class_from_cache("operator", members=operator_methods)
-    generate_file(filename, "operatorfile", [param, lop])
-
-
-def generate_localoperator_basefile(formdatas, data):
-    filename = get_option("operator_file")
-    for formdata in formdatas:
-        lop_filename = name_localoperator_file(formdata, data)
-        include_file(lop_filename, filetag="operatorbasefile")
-
-    from dune.perftool.file import generate_file
-    generate_file(filename, "operatorbasefile", [])
+    generate_file(filename, "operatorfile", [lop])
diff --git a/python/dune/perftool/pdelab/parameter.py b/python/dune/perftool/pdelab/parameter.py
deleted file mode 100644
index 63d47bf090ea69a4e14663f0ff94e2b42b5fa981..0000000000000000000000000000000000000000
--- a/python/dune/perftool/pdelab/parameter.py
+++ /dev/null
@@ -1,264 +0,0 @@
-""" Generators for parameter functions """
-
-from dune.perftool.generation import (class_basename,
-                                      class_member,
-                                      constructor_parameter,
-                                      generator_factory,
-                                      get_backend,
-                                      get_global_context_value,
-                                      initializer_list,
-                                      kernel_cached,
-                                      preamble,
-                                      temporary_variable
-                                      )
-from dune.perftool.pdelab.geometry import (name_cell,
-                                           name_intersection,
-                                           )
-from dune.perftool.pdelab.quadrature import quadrature_preamble
-from dune.perftool.tools import get_pymbolic_basename
-from dune.perftool.cgen.clazz import AccessModifier
-from dune.perftool.pdelab.localoperator import (class_type_from_cache,
-                                                localoperator_basename,
-                                                )
-from dune.perftool.loopy.target import type_floatingpoint
-
-from loopy.match import Writes
-
-
-@class_basename(classtag="parameterclass")
-def parameterclass_basename(formdata, data):
-    lopbase = localoperator_basename(formdata, data)
-    return "{}Params".format(lopbase)
-
-
-@class_member(classtag="operator")
-def define_parameterclass(name):
-    _, t = class_type_from_cache("parameterclass")
-    constructor_parameter("{}&".format(t), name + "_", classtag="operator")
-    initializer_list(name, [name + "_"], classtag="operator")
-    return "{}& {};".format(t, name)
-
-
-def name_paramclass():
-    formdata = get_global_context_value("formdata")
-    from dune.perftool.pdelab.driver.gridoperator import name_parameters
-    name = name_parameters(formdata)
-    define_parameterclass(name)
-    return name
-
-
-@class_member(classtag="parameterclass")
-def define_time(name):
-    initializer_list(name, ["0.0"], classtag="parameterclass")
-    ftype = type_floatingpoint()
-    return "{} {};".format(ftype, name)
-
-
-def name_time():
-    define_time("t")
-    return "t"
-
-
-def define_set_time_method():
-    define_set_time_method_parameterclass()
-    define_set_time_method_operator()
-
-
-@class_member(classtag="operator")
-def define_set_time_method_operator():
-    time_name = name_time()
-    param = name_paramclass()
-    ftype = type_floatingpoint()
-
-    result = ["// Set time in instationary case",
-              "void setTime ({} t_)".format(ftype),
-              "{",
-              "  Dune::PDELab::InstationaryLocalOperatorDefaultMethods<{}>::setTime(t_);".format(ftype),
-              "  {}.setTime(t_);".format(param),
-              "}"
-              ]
-
-    return result
-
-
-@class_member(classtag="parameterclass")
-def define_set_time_method_parameterclass():
-    time_name = name_time()
-    ftype = type_floatingpoint()
-
-    result = ["// Set time in instationary case",
-              "void setTime ({} t_)".format(ftype),
-              "{",
-              "  {} = t_;".format(time_name),
-              "}"
-              ]
-
-    return result
-
-
-def combine_tree_path_argnumber(element, tree_path_int):
-    # Return string combining tree_path and argnumber.
-    subel = element.extract_subelement_component(tree_path_int)
-
-    def _flatten(x):
-        if isinstance(x, tuple):
-            return '_'.join(_flatten(i) for i in x if i != ())
-        else:
-            return str(x)
-
-    return _flatten(subel)
-
-
-@class_member(classtag="parameterclass")
-def define_parameter_function_class_member(name, expr, baset, shape, cell):
-    t = construct_nested_fieldvector(baset, shape)
-
-    geot = "E" if cell else "I"
-    geo = geot.lower()
-    result = ["template<typename {}, typename X>".format(geot),
-              "{} {}(const {}& {}, const X& local) const".format(t, name, geot, geo),
-              "{",
-              ]
-
-    # In the case of a non-scalar parameter function, recurse into leafs
-    if expr.element.value_shape():
-        # Check that this is a VectorElement, as I have no idea how a parameter function
-        # over a non-vector mixed element should be well-defined in PDELab.
-        from ufl import VectorElement
-        assert isinstance(expr.element, VectorElement)
-
-        result.append("  {} result(0.0);".format(t))
-
-        from dune.perftool.ufl.execution import split_expression
-        for i, subexpr in enumerate(split_expression(expr)):
-            child_name = "{}_{}".format(name, combine_tree_path_argnumber(expr.element, i))
-            result.append("  result[{}] = {}({}, local);".format(i, child_name, geo))
-            define_parameter_function_class_member(child_name, subexpr, baset, shape[1:], cell)
-
-        result.append("  return result;")
-
-    else:
-        # Evaluate a scalar parameter function
-        if expr.is_global:
-            result.append("  auto x = {}.geometry().global(local);".format(geo))
-        else:
-            result.append("  auto x = local;")
-
-        result.append("  " + expr.c_expr[0])
-
-    result.append("}")
-
-    return result
-
-
-@preamble
-def evaluate_cellwise_constant_parameter_function(name, restriction):
-    param = name_paramclass()
-    entity = name_cell(restriction)
-    from dune.perftool.pdelab.geometry import name_localcenter
-    pos = name_localcenter()
-
-    from dune.perftool.generation.loopy import valuearg
-    import numpy
-    valuearg(name)
-
-    return 'auto {} = {}.{}({}, {});'.format(name,
-                                             name_paramclass(),
-                                             name,
-                                             entity,
-                                             pos,
-                                             )
-
-
-@preamble
-def evaluate_intersectionwise_constant_parameter_function(name):
-    # Check that this is not a volume term, as that would not be well-defined
-    from dune.perftool.generation import get_global_context_value
-    it = get_global_context_value("integral_type")
-    assert it is not 'cell'
-
-    param = name_paramclass()
-    intersection = name_intersection()
-    pos = name_localcenter()
-
-    from dune.perftool.generation.loopy import valuearg
-    import numpy
-    valuearg(name)
-
-    return 'auto {} = {}.{}({}, {});'.format(name,
-                                             name_paramclass(),
-                                             name,
-                                             intersection,
-                                             pos,
-                                             )
-
-
-def evaluate_cell_parameter_function(name, restriction):
-    param = name_paramclass()
-    entity = name_cell(restriction)
-    pos = get_backend(interface="qp_in_cell")(restriction)
-    return quadrature_preamble('{} = {}.{}({}, {});'.format(name,
-                                                            name_paramclass(),
-                                                            name,
-                                                            entity,
-                                                            str(pos),
-                                                            ),
-                               assignees=frozenset({name}),
-                               read_variables=frozenset({get_pymbolic_basename(pos)}),
-                               depends_on=frozenset({Writes(get_pymbolic_basename(pos))}),
-                               )
-
-
-def evaluate_intersection_parameter_function(name):
-    # Check that this is not a volume term, as that would not be well-defined
-    from dune.perftool.generation import get_global_context_value
-    it = get_global_context_value("integral_type")
-    assert it is not 'cell'
-
-    param = name_paramclass()
-    intersection = name_intersection()
-    pos = get_backend("quad_pos")()
-    return quadrature_preamble('{} = {}.{}({}, {});'.format(name,
-                                                            name_paramclass(),
-                                                            name,
-                                                            intersection,
-                                                            str(pos),
-                                                            ),
-                               assignees=frozenset({name}),
-                               read_variables=frozenset({get_pymbolic_basename(pos)}),
-                               depends_on=frozenset({Writes(get_pymbolic_basename(pos))}),
-                               )
-
-
-def construct_nested_fieldvector(t, shape):
-    if len(shape) == 0:
-        return t
-    return 'Dune::FieldVector<{}, {}>'.format(construct_nested_fieldvector(t, shape[1:]), shape[0])
-
-
-@kernel_cached
-def cell_parameter_function(name, expr, restriction, cellwise_constant, t='float64'):
-    shape = expr.ufl_element().value_shape()
-    shape_impl = ('fv',) * len(shape)
-    from dune.perftool.loopy.target import numpy_to_cpp_dtype
-    t = numpy_to_cpp_dtype(t)
-    define_parameter_function_class_member(name, expr, t, shape, True)
-    if cellwise_constant:
-        evaluate_cellwise_constant_parameter_function(name, restriction)
-    else:
-        temporary_variable(name, shape=shape, shape_impl=shape_impl)
-        evaluate_cell_parameter_function(name, restriction)
-
-
-@kernel_cached
-def intersection_parameter_function(name, expr, cellwise_constant, t='float64'):
-    shape = expr.ufl_element().value_shape()
-    shape_impl = ('fv',) * len(shape)
-    from dune.perftool.loopy.target import numpy_to_cpp_dtype
-    t = numpy_to_cpp_dtype(t)
-    define_parameter_function_class_member(name, expr, t, shape, False)
-    if cellwise_constant:
-        evaluate_intersectionwise_constant_parameter_function(name)
-    else:
-        temporary_variable(name, shape=shape, shape_impl=shape_impl)
-        evaluate_intersection_parameter_function(name)
diff --git a/python/dune/perftool/pdelab/quadrature.py b/python/dune/perftool/pdelab/quadrature.py
index 02e4a428348d81764ec9672c25370e9be6d14969..031d97b019df0cab8355d4295b5229584e077b9b 100644
--- a/python/dune/perftool/pdelab/quadrature.py
+++ b/python/dune/perftool/pdelab/quadrature.py
@@ -14,8 +14,7 @@ from dune.perftool.generation import (backend,
                                       valuearg,
                                       )
 from dune.perftool.pdelab.localoperator import lop_template_range_field
-from dune.perftool.options import get_option
-from dune.perftool.ufl.modified_terminals import Restriction
+from dune.perftool.options import get_form_option
 
 from pymbolic.primitives import Variable, Subscript
 
@@ -184,7 +183,10 @@ def _estimate_quadrature_order():
     """Estimate quadrature order using polynomial degree estimation from UFL"""
     # According to UFL documentation estimate_total_polynomial_degree
     # should only be called on preprocessed forms.
-    form = get_global_context_value("formdata").preprocessed_form
+    data = get_global_context_value("data")
+    form = data.object_by_name[get_form_option("form")]
+    from dune.perftool.ufl.preprocess import preprocess_form
+    form = preprocess_form(form).preprocessed_form
 
     # Estimate polynomial degree of integrals of current type (eg 'Cell')
     integral_type = get_global_context_value("integral_type")
@@ -223,8 +225,8 @@ def quadrature_order():
     - If you use sum factorization and TensorProductElement it is
       possible to use a different quadrature_order per direction.
     """
-    if get_option("quadrature_order"):
-        quadrature_order = tuple(map(int, get_option("quadrature_order").split(',')))
+    if get_form_option("quadrature_order"):
+        quadrature_order = tuple(map(int, get_form_option("quadrature_order").split(',')))
     else:
         quadrature_order = _estimate_quadrature_order()
 
@@ -235,7 +237,7 @@ def quadrature_order():
         if len(quadrature_order) == 1:
             quadrature_order = quadrature_order[0]
     if isinstance(quadrature_order, tuple):
-        if not get_option('sumfact'):
+        if not get_form_option('sumfact'):
             raise NotImplementedError("Different quadrature order per direction is only implemented for kernels using sum factorization.")
         from dune.perftool.pdelab.geometry import world_dimension
         assert(len(quadrature_order) == world_dimension())
diff --git a/python/dune/perftool/pdelab/restriction.py b/python/dune/perftool/pdelab/restriction.py
index 7d77a17b6c7e14107a359f711d110584b1a83cb2..03c55eaec4874c2611d892d74d8a14476cf8435d 100644
--- a/python/dune/perftool/pdelab/restriction.py
+++ b/python/dune/perftool/pdelab/restriction.py
@@ -2,9 +2,24 @@ from dune.perftool.ufl.modified_terminals import Restriction
 
 
 def restricted_name(name, restriction):
+    """Adapt name according to the restriction
+
+    Some remarks:
+
+    - UFL defines the jump as follows: jump(v) = v('+') - v('-').
+
+    - The corresponding outer normal vector is n =
+      FacetNormal(cell)('+'). The user needs to make the right choice
+      in the UFL file.
+
+    - In the literature this convention is sometimes swapped. In order
+      to be consistent with UFL we choose ('+') as self and ('-') as
+      neighbor and choose the outer unit normal vector accordingly.
+
+    """
     if restriction == Restriction.NONE:
         return name
     if restriction == Restriction.POSITIVE:
-        return name + '_n'
-    if restriction == Restriction.NEGATIVE:
         return name + '_s'
+    if restriction == Restriction.NEGATIVE:
+        return name + '_n'
diff --git a/python/dune/perftool/pdelab/signatures.py b/python/dune/perftool/pdelab/signatures.py
index 97560a248a4aa28f3db489859c64dcb48d54c58c..2990c8b24b9c89bcca243dce78c9144914c26b44 100644
--- a/python/dune/perftool/pdelab/signatures.py
+++ b/python/dune/perftool/pdelab/signatures.py
@@ -53,7 +53,6 @@ def kernel_name():
 def assembly_routine_signature():
     integral_type = get_global_context_value("integral_type")
     form_type = get_global_context_value("form_type")
-    formdata = get_global_context_value("formdata")
 
     templates, args = {('residual', 'cell'): (alpha_volume_templates, alpha_volume_args),
                        ('residual', 'exterior_facet'): (alpha_boundary_templates, alpha_boundary_args),
@@ -66,7 +65,7 @@ def assembly_routine_signature():
     if templates is None:
         # Check if form is linear
         from dune.perftool.pdelab.driver import is_linear
-        linear = is_linear(formdata.original_form)
+        linear = is_linear()
 
         templates, args = {('jacobian_apply', 'cell', True): (jacobian_apply_volume_templates, jacobian_apply_volume_args),
                            ('jacobian_apply', 'exterior_facet', True): (jacobian_apply_boundary_templates, jacobian_apply_boundary_args),
@@ -82,7 +81,6 @@ def assembly_routine_signature():
 def assembly_routine_args():
     integral_type = get_global_context_value("integral_type")
     form_type = get_global_context_value("form_type")
-    formdata = get_global_context_value("formdata")
 
     args = {('residual', 'cell'): alpha_volume_args,
             ('residual', 'exterior_facet'): alpha_boundary_args,
@@ -95,7 +93,7 @@ def assembly_routine_args():
     if args is None:
         # Check if form is linear
         from dune.perftool.pdelab.driver import is_linear
-        linear = is_linear(formdata.original_form)
+        linear = is_linear()
 
         args = {('jacobian_apply', 'cell', True): jacobian_apply_volume_args,
                 ('jacobian_apply', 'exterior_facet', True): jacobian_apply_boundary_args,
@@ -143,10 +141,10 @@ def alpha_boundary_templates():
 
 def alpha_boundary_args():
     geo = name_geometry_wrapper()
-    lfsu = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsv = name_testfunctionspace(Restriction.NEGATIVE)
-    cc = name_coefficientcontainer(Restriction.NEGATIVE)
-    av = name_accumulation_variable((Restriction.NEGATIVE,))
+    lfsu = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsv = name_testfunctionspace(Restriction.POSITIVE)
+    cc = name_coefficientcontainer(Restriction.POSITIVE)
+    av = name_accumulation_variable((Restriction.POSITIVE,))
     return ((True, geo), (True, lfsu), (True, cc), (True, lfsv), (False, av))
 
 
@@ -161,14 +159,14 @@ def alpha_skeleton_templates():
 
 def alpha_skeleton_args():
     geo = name_geometry_wrapper()
-    lfsu_s = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsu_n = name_trialfunctionspace(Restriction.POSITIVE)
-    lfsv_s = name_testfunctionspace(Restriction.NEGATIVE)
-    lfsv_n = name_testfunctionspace(Restriction.POSITIVE)
-    cc_s = name_coefficientcontainer(Restriction.NEGATIVE)
-    cc_n = name_coefficientcontainer(Restriction.POSITIVE)
-    av_s = name_accumulation_variable((Restriction.NEGATIVE,))
-    av_n = name_accumulation_variable((Restriction.POSITIVE,))
+    lfsu_s = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsu_n = name_trialfunctionspace(Restriction.NEGATIVE)
+    lfsv_s = name_testfunctionspace(Restriction.POSITIVE)
+    lfsv_n = name_testfunctionspace(Restriction.NEGATIVE)
+    cc_s = name_coefficientcontainer(Restriction.POSITIVE)
+    cc_n = name_coefficientcontainer(Restriction.NEGATIVE)
+    av_s = name_accumulation_variable((Restriction.POSITIVE,))
+    av_n = name_accumulation_variable((Restriction.NEGATIVE,))
     return ((True, geo), (True, lfsu_s), (True, cc_s), (True, lfsv_s), (True, lfsu_n), (True, cc_n), (True, lfsv_n), (False, av_s), (False, av_n))
 
 
@@ -201,10 +199,10 @@ def jacobian_boundary_templates():
 
 def jacobian_boundary_args():
     geo = name_geometry_wrapper()
-    lfsu = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsv = name_testfunctionspace(Restriction.NEGATIVE)
-    cc = name_coefficientcontainer(Restriction.NEGATIVE)
-    av = name_accumulation_variable((Restriction.NEGATIVE, Restriction.NEGATIVE))
+    lfsu = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsv = name_testfunctionspace(Restriction.POSITIVE)
+    cc = name_coefficientcontainer(Restriction.POSITIVE)
+    av = name_accumulation_variable((Restriction.POSITIVE, Restriction.POSITIVE))
     return ((True, geo), (True, lfsu), (True, cc), (True, lfsv), (False, av))
 
 
@@ -219,16 +217,16 @@ def jacobian_skeleton_templates():
 
 def jacobian_skeleton_args():
     geo = name_geometry_wrapper()
-    lfsu_s = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsu_n = name_trialfunctionspace(Restriction.POSITIVE)
-    lfsv_s = name_testfunctionspace(Restriction.NEGATIVE)
-    lfsv_n = name_testfunctionspace(Restriction.POSITIVE)
-    cc_s = name_coefficientcontainer(Restriction.NEGATIVE)
-    cc_n = name_coefficientcontainer(Restriction.POSITIVE)
-    av_ss = name_accumulation_variable((Restriction.NEGATIVE, Restriction.NEGATIVE))
-    av_sn = name_accumulation_variable((Restriction.NEGATIVE, Restriction.POSITIVE))
-    av_ns = name_accumulation_variable((Restriction.POSITIVE, Restriction.NEGATIVE))
-    av_nn = name_accumulation_variable((Restriction.POSITIVE, Restriction.POSITIVE))
+    lfsu_s = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsu_n = name_trialfunctionspace(Restriction.NEGATIVE)
+    lfsv_s = name_testfunctionspace(Restriction.POSITIVE)
+    lfsv_n = name_testfunctionspace(Restriction.NEGATIVE)
+    cc_s = name_coefficientcontainer(Restriction.POSITIVE)
+    cc_n = name_coefficientcontainer(Restriction.NEGATIVE)
+    av_ss = name_accumulation_variable((Restriction.POSITIVE, Restriction.POSITIVE))
+    av_sn = name_accumulation_variable((Restriction.POSITIVE, Restriction.NEGATIVE))
+    av_ns = name_accumulation_variable((Restriction.NEGATIVE, Restriction.POSITIVE))
+    av_nn = name_accumulation_variable((Restriction.NEGATIVE, Restriction.NEGATIVE))
     return ((True, geo), (True, lfsu_s), (True, cc_s), (True, lfsv_s), (True, lfsu_n), (True, cc_n), (True, lfsv_n), (False, av_ss), (False, av_sn), (False, av_ns), (False, av_nn))
 
 
@@ -261,10 +259,10 @@ def jacobian_apply_boundary_templates():
 
 def jacobian_apply_boundary_args():
     geo = name_geometry_wrapper()
-    lfsu = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsv = name_testfunctionspace(Restriction.NEGATIVE)
-    ac = name_applycontainer(Restriction.NEGATIVE)
-    av = name_accumulation_variable((Restriction.NEGATIVE,))
+    lfsu = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsv = name_testfunctionspace(Restriction.POSITIVE)
+    ac = name_applycontainer(Restriction.POSITIVE)
+    av = name_accumulation_variable((Restriction.POSITIVE,))
     return ((True, geo), (True, lfsu), (True, ac), (True, lfsv), (False, av))
 
 
@@ -279,14 +277,14 @@ def jacobian_apply_skeleton_templates():
 
 def jacobian_apply_skeleton_args():
     geo = name_geometry_wrapper()
-    lfsu_s = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsu_n = name_trialfunctionspace(Restriction.POSITIVE)
-    lfsv_s = name_testfunctionspace(Restriction.NEGATIVE)
-    lfsv_n = name_testfunctionspace(Restriction.POSITIVE)
-    ac_s = name_applycontainer(Restriction.NEGATIVE)
-    ac_n = name_applycontainer(Restriction.POSITIVE)
-    av_s = name_accumulation_variable((Restriction.NEGATIVE,))
-    av_n = name_accumulation_variable((Restriction.POSITIVE,))
+    lfsu_s = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsu_n = name_trialfunctionspace(Restriction.NEGATIVE)
+    lfsv_s = name_testfunctionspace(Restriction.POSITIVE)
+    lfsv_n = name_testfunctionspace(Restriction.NEGATIVE)
+    ac_s = name_applycontainer(Restriction.POSITIVE)
+    ac_n = name_applycontainer(Restriction.NEGATIVE)
+    av_s = name_accumulation_variable((Restriction.POSITIVE,))
+    av_n = name_accumulation_variable((Restriction.NEGATIVE,))
     return ((True, geo), (True, lfsu_s), (True, ac_s), (True, lfsv_s), (True, lfsu_n), (True, ac_n), (True, lfsv_n), (False, av_s), (False, av_n))
 
 
@@ -320,11 +318,11 @@ def nonlinear_jacobian_apply_boundary_templates():
 
 def nonlinear_jacobian_apply_boundary_args():
     geo = name_geometry_wrapper()
-    lfsu = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsv = name_testfunctionspace(Restriction.NEGATIVE)
-    cc = name_coefficientcontainer(Restriction.NEGATIVE)
-    ac = name_applycontainer(Restriction.NEGATIVE)
-    av = name_accumulation_variable((Restriction.NEGATIVE,))
+    lfsu = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsv = name_testfunctionspace(Restriction.POSITIVE)
+    cc = name_coefficientcontainer(Restriction.POSITIVE)
+    ac = name_applycontainer(Restriction.POSITIVE)
+    av = name_accumulation_variable((Restriction.POSITIVE,))
     return ((True, geo), (True, lfsu), (True, cc), (True, ac), (True, lfsv), (False, av))
 
 
@@ -339,14 +337,14 @@ def nonlinear_jacobian_apply_skeleton_templates():
 
 def nonlinear_jacobian_apply_skeleton_args():
     geo = name_geometry_wrapper()
-    lfsu_s = name_trialfunctionspace(Restriction.NEGATIVE)
-    lfsu_n = name_trialfunctionspace(Restriction.POSITIVE)
-    lfsv_s = name_testfunctionspace(Restriction.NEGATIVE)
-    lfsv_n = name_testfunctionspace(Restriction.POSITIVE)
-    cc_s = name_coefficientcontainer(Restriction.NEGATIVE)
-    cc_n = name_coefficientcontainer(Restriction.POSITIVE)
-    ac_s = name_applycontainer(Restriction.NEGATIVE)
-    ac_n = name_applycontainer(Restriction.POSITIVE)
-    av_s = name_accumulation_variable((Restriction.NEGATIVE,))
-    av_n = name_accumulation_variable((Restriction.POSITIVE,))
+    lfsu_s = name_trialfunctionspace(Restriction.POSITIVE)
+    lfsu_n = name_trialfunctionspace(Restriction.NEGATIVE)
+    lfsv_s = name_testfunctionspace(Restriction.POSITIVE)
+    lfsv_n = name_testfunctionspace(Restriction.NEGATIVE)
+    cc_s = name_coefficientcontainer(Restriction.POSITIVE)
+    cc_n = name_coefficientcontainer(Restriction.NEGATIVE)
+    ac_s = name_applycontainer(Restriction.POSITIVE)
+    ac_n = name_applycontainer(Restriction.NEGATIVE)
+    av_s = name_accumulation_variable((Restriction.POSITIVE,))
+    av_n = name_accumulation_variable((Restriction.NEGATIVE,))
     return ((True, geo), (True, lfsu_s), (True, cc_s), (True, ac_s), (True, lfsv_s), (True, lfsu_n), (True, cc_n), (True, ac_n), (True, lfsv_n), (False, av_s), (False, av_n))
diff --git a/python/dune/perftool/pdelab/spaces.py b/python/dune/perftool/pdelab/spaces.py
index 706be5b91a186b990719227fe61f1336d01313a0..d5f62735ea0b1d98c3496f49f0d062fc62bd1e5a 100644
--- a/python/dune/perftool/pdelab/spaces.py
+++ b/python/dune/perftool/pdelab/spaces.py
@@ -125,7 +125,7 @@ type_gfs = partial(_function_space_traversal, defaultname=available_gfs_names, r
 def initialize_function_spaces(expr, visitor):
     restriction = visitor.restriction
     if visitor.measure == 'exterior_facet':
-        restriction = Restriction.NEGATIVE
+        restriction = Restriction.POSITIVE
 
     index = None
     from ufl import MixedElement
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index 3c935161e5d837d2397736a24a996fff4f312384..4e7c2a6991b1c6bda49862de0aa47167c1b1ea2a 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -9,34 +9,41 @@ from dune.perftool.generation import (backend,
                                       generator_factory,
                                       get_counted_variable,
                                       get_counter,
+                                      get_global_context_value,
+                                      globalarg,
                                       iname,
                                       instruction,
                                       post_include,
                                       kernel_cached,
                                       temporary_variable,
                                       transform,
+                                      valuearg
                                       )
-from dune.perftool.options import get_option
+from dune.perftool.options import (get_form_option,
+                                   get_option,
+                                   )
 from dune.perftool.loopy.flatten import flatten_index
-from dune.perftool.loopy.buffer import get_buffer_temporary
+from dune.perftool.loopy.target import type_floatingpoint
 from dune.perftool.sumfact.quadrature import nest_quadrature_loops
+from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.localoperator import determine_accumulation_space
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.pdelab.signatures import assembler_routine_name
 from dune.perftool.pdelab.geometry import world_dimension
+from dune.perftool.pdelab.spaces import name_lfs
 from dune.perftool.sumfact.tabulation import (basis_functions_per_direction,
                                               construct_basis_matrix_sequence,
                                               )
 from dune.perftool.sumfact.switch import (get_facedir,
                                           get_facemod,
                                           )
-from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase
+from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase
 from dune.perftool.ufl.modified_terminals import extract_modified_arguments
-from dune.perftool.tools import get_pymbolic_basename
+from dune.perftool.tools import get_pymbolic_basename, get_leaf
 from dune.perftool.error import PerftoolError
 from dune.perftool.sumfact.quadrature import quadrature_inames
 
-from pytools import ImmutableRecord
+from pytools import ImmutableRecord, product
 
 import loopy as lp
 import numpy as np
@@ -79,21 +86,182 @@ def accum_iname(element, bound, i):
     return sumfact_iname(bound, "accum{}".format(suffix))
 
 
-class AlreadyAssembledInput(SumfactKernelInputBase):
-    def __init__(self, index):
-        self.index = index
-
-    def __eq__(self, other):
-        return type(self) == type(other) and self.index == other.index
+class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
+    def __init__(self,
+                 accumvar=None,
+                 restriction=None,
+                 test_element=None,
+                 test_element_index=None,
+                 trial_element=None,
+                 trial_element_index=None,
+                 ):
+        # TODO: Isn't accumvar superfluous given all the other information?
+        ImmutableRecord.__init__(self,
+                                 accumvar=accumvar,
+                                 restriction=restriction,
+                                 test_element=test_element,
+                                 test_element_index=test_element_index,
+                                 trial_element=trial_element,
+                                 trial_element_index=trial_element_index,
+                                 )
 
     def __repr__(self):
-        return "AlreadyAssembledInput({})".format(self.index)
+        return ImmutableRecord.__repr__(self)
+
+    @property
+    def stage(self):
+        return 3
+
+    @property
+    def direct_is_possible(self):
+        return get_form_option("fastdg")
+
+    @property
+    def within_inames(self):
+        if self.trial_element is None:
+            return ()
+        else:
+            from dune.perftool.sumfact.basis import lfs_inames
+            return lfs_inames(get_leaf(self.trial_element, self.trial_element_index), self.restriction)
+
+    def realize(self, sf, result, insn_dep, inames=None, additional_inames=()):
+        trial_leaf_element = get_leaf(self.trial_element, self.trial_element_index) if self.trial_element is not None else None
+
+        basis_size = tuple(mat.basis_size for mat in sf.matrix_sequence)
+
+        if inames is None:
+            inames = tuple(accum_iname(trial_leaf_element, mat.rows, i)
+                           for i, mat in enumerate(sf.matrix_sequence))
+
+            # Determine the expression to accumulate with. This depends on the vectorization strategy!
+            from dune.perftool.tools import maybe_wrap_subscript
+            result = maybe_wrap_subscript(result, tuple(prim.Variable(i) for i in inames))
+
+        # Collect the lfs and lfs indices for the accumulate call
+        restriction = (0, 0) if self.restriction is None else self.restriction
+        test_lfs = name_lfs(self.test_element, restriction[0], self.test_element_index)
+        valuearg(test_lfs, dtype=lp.types.NumpyType("str"))
+        test_lfs_index = flatten_index(tuple(prim.Variable(i) for i in inames),
+                                       basis_size,
+                                       order="f"
+                                       )
+
+        accum_args = [prim.Variable(test_lfs), test_lfs_index]
+
+        # In the jacobian case, also determine the space for the ansatz space
+        if sf.within_inames:
+            # TODO the next line should get its inames from
+            # elsewhere. This is *NOT* robust (but works right now)
+            ansatz_lfs = name_lfs(self.trial_element, restriction[1], self.trial_element_index)
+            valuearg(ansatz_lfs, dtype=lp.types.NumpyType("str"))
+            from dune.perftool.sumfact.basis import _basis_functions_per_direction
+            ansatz_lfs_index = flatten_index(tuple(prim.Variable(sf.within_inames[i])
+                                                   for i in range(world_dimension())),
+                                             _basis_functions_per_direction(trial_leaf_element),
+                                             order="f"
+                                             )
+
+            accum_args.append(prim.Variable(ansatz_lfs))
+            accum_args.append(ansatz_lfs_index)
+
+        accum_args.append(result)
+
+        if not get_form_option("fastdg"):
+            rank = 2 if self.within_inames else 1
+            expr = prim.Call(PDELabAccumulationFunction(self.accumvar, rank),
+                             tuple(accum_args)
+                             )
+            dep = instruction(assignees=(),
+                              expression=expr,
+                              forced_iname_deps=frozenset(inames + additional_inames + self.within_inames),
+                              forced_iname_deps_is_final=True,
+                              depends_on=insn_dep,
+                              predicates=sf.predicates,
+                              tags=frozenset({"sumfact_stage3"}),
+                              )
+
+        return frozenset({dep})
+
+    def realize_direct(self, result, inames, shape, which=0, **args):
+        direct_output = "fastdg{}".format(which)
+        ftags = ",".join(["f"] * len(shape))
+
+        if self.trial_element is None:
+            globalarg(direct_output,
+                      shape=shape,
+                      dim_tags=ftags,
+                      offset=_dof_offset(self.test_element, self.test_element_index),
+                      )
+            lhs = prim.Subscript(prim.Variable(direct_output), inames)
+        else:
+            rowsize = sum(tuple(s for s in _local_sizes(self.trial_element)))
+            manual_strides = tuple("stride:{}".format(rowsize * product(shape[:i])) for i in range(len(shape)))
+            offset = "jacobian_offset{}".format(which)
+            valuearg(offset)
+            globalarg(direct_output,
+                      shape=shape,
+                      offset=prim.Variable(offset) + rowsize * _dof_offset(self.test_element, self.test_element_index) + _dof_offset(self.trial_element, self.trial_element_index),
+                      dim_tags=manual_strides,
+                      )
+            lhs = prim.Subscript(prim.Variable(direct_output), inames)
+
+        result = prim.Sum((lhs, result))
+        return frozenset({instruction(assignee=lhs,
+                                      expression=result,
+                                      tags=frozenset({"sumfact_stage3"}),
+                                      **args)})
+
+    @property
+    def function_name_suffix(self):
+        if get_form_option("fastdg"):
+            suffix = "_fastdg1_{}comp{}".format(FEM_name_mangling(self.test_element), self.test_element_index)
+            if self.within_inames:
+                suffix = "{}x{}comp{}".format(suffix, FEM_name_mangling(self.trial_element), self.trial_element_index)
+            return suffix
+        else:
+            return ""
+
+    @property
+    def function_args(self):
+        if get_form_option("fastdg"):
+            ret = ("{}.data()".format(self.accumvar),)
+            if get_form_option("fastdg") and self.within_inames:
+                element = get_leaf(self.trial_element, self.trial_element_index)
+                shape = tuple(element.degree() + 1 for e in range(element.cell().geometric_dimension()))
+                jacobian_index = flatten_index(tuple(prim.Variable(i) for i in self.within_inames), shape, order="f")
+                ret = ret + (str(jacobian_index),)
+            return ret
+        else:
+            return ()
+
+    @property
+    def signature_args(self):
+        if get_form_option('fastdg'):
+            ret = ("{}* fastdg0".format(type_floatingpoint()),)
+            if self.within_inames:
+                ret = ret + ("unsigned int jacobian_offset0",)
+            return ret
+        else:
+            return ()
+
+
+def _local_sizes(element):
+    from ufl import FiniteElement, MixedElement
+    if isinstance(element, MixedElement):
+        for subel in element.sub_elements():
+            for s in _local_sizes(subel):
+                yield s
+    else:
+        assert isinstance(element, FiniteElement)
+        yield (element.degree() + 1)**element.cell().geometric_dimension()
 
-    def __hash__(self):
-        return hash(self.index)
 
-    def __str__(self):
-        return "Input{}".format(self.index[0])
+def _dof_offset(element, component):
+    if component is None:
+        return 0
+    else:
+        sizes = tuple(s for s in _local_sizes(element))
+        return sum(sizes[0:component])
 
 
 class SumfactAccumulationInfo(ImmutableRecord):
@@ -131,7 +299,7 @@ def get_accumulation_info(expr, visitor):
     restriction = visitor.restriction
     if visitor.measure == 'exterior_facet':
         from dune.perftool.pdelab.restriction import Restriction
-        restriction = Restriction.NEGATIVE
+        restriction = Restriction.POSITIVE
 
     inames = visitor.interface.lfs_inames(leaf_element,
                                           restriction,
@@ -173,9 +341,9 @@ def _test_generator(expr, visitor):
     if visitor.measure == "cell":
         restrictions = (Restriction.NONE,)
     elif visitor.measure == "exterior_facet":
-        restrictions = (Restriction.NEGATIVE,)
+        restrictions = (Restriction.POSITIVE,)
     elif visitor.measure == "interior_facet":
-        restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE)
+        restrictions = (Restriction.POSITIVE, Restriction.NEGATIVE)
     for res in restrictions:
         for ei, e in _get_childs(element):
             for grad in (None,) + tuple(range(dim)):
@@ -196,9 +364,9 @@ def _trial_generator(expr, visitor):
     if visitor.measure == "cell":
         restrictions = (Restriction.NONE,)
     elif visitor.measure == "exterior_facet":
-        restrictions = (Restriction.NEGATIVE,)
+        restrictions = (Restriction.POSITIVE,)
     elif visitor.measure == "interior_facet":
-        restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE)
+        restrictions = (Restriction.POSITIVE, Restriction.NEGATIVE)
     for res in restrictions:
         for ei, e in _get_childs(element):
             yield SumfactAccumulationInfo(element_index=ei, restriction=res, element=e)
@@ -264,16 +432,17 @@ def generate_accumulation_instruction(expr, visitor):
     if priority is None:
         priority = 3
 
+    output = AccumulationOutput(accumvar=accumvar,
+                                restriction=(test_info.restriction, trial_info.restriction),
+                                test_element=test_info.element,
+                                test_element_index=test_info.element_index,
+                                trial_element=trial_info.element,
+                                trial_element_index=trial_info.element_index,
+                                )
+
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
-                       restriction=(test_info.restriction, trial_info.restriction),
-                       stage=3,
                        position_priority=priority,
-                       accumvar=accumvar,
-                       test_element=test_info.element,
-                       test_element_index=test_info.element_index,
-                       trial_element=trial_info.element,
-                       trial_element_index=trial_info.element_index,
-                       input=AlreadyAssembledInput(index=(test_info.element_index,)),
+                       interface=output,
                        predicates=predicates,
                        )
 
@@ -287,11 +456,14 @@ def generate_accumulation_instruction(expr, visitor):
 
     vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset()
 
-    temp = get_buffer_temporary(buffer,
-                                shape=vsf.quadrature_shape,
-                                dim_tags=vsf.quadrature_dimtags,
-                                name="input_{}".format(buffer),
-                                )
+    from dune.perftool.sumfact.realization import name_buffer_storage
+    temp = "input_{}".format(buffer)
+    temporary_variable(temp,
+                       shape=vsf.quadrature_shape,
+                       dim_tags=vsf.quadrature_dimtags,
+                       custom_base_storage=name_buffer_storage(buffer, 0),
+                       managed=True,
+                       )
 
     # Those input fields, that are padded need to be set to zero
     # in order to do a horizontal_add later on
@@ -304,16 +476,6 @@ def generate_accumulation_instruction(expr, visitor):
                     tags=frozenset(["quadvec", "gradvec"]),
                     )
 
-    # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
-    timer_dep = frozenset()
-    if get_option("instrumentation_level") >= 4:
-        timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
-        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-        dump_accumulate_timer(timer_name)
-        if(jacobian_inames):
-            timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                               within_inames=frozenset(jacobian_inames))})
-
     # Determine dependencies
     from loopy.match import Or, Writes
     from loopy.symbolic import DependencyMapper
@@ -328,68 +490,17 @@ def generate_accumulation_instruction(expr, visitor):
                               expression=expr,
                               forced_iname_deps=frozenset(quadrature_inames(trial_leaf_element) + jacobian_inames),
                               forced_iname_deps_is_final=True,
-                              tags=frozenset({"quadvec"}).union(vectag),
-                              depends_on=frozenset({deps}).union(timer_dep).union(frozenset({lp.match.Tagged("sumfact_stage1")})),
+                              tags=frozenset({"quadvec", "sumfact_stage2"}).union(vectag),
+                              depends_on=frozenset({deps}).union(frozenset({lp.match.Tagged("sumfact_stage1")})),
                               )
 
     if insn_dep is None:
         insn_dep = frozenset({contrib_dep})
 
-    if get_option("instrumentation_level") >= 4:
-        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                          depends_on=insn_dep,
-                                          within_inames=frozenset(jacobian_inames))})
-
-    inames = tuple(accum_iname(trial_leaf_element, mat.rows, i)
-                   for i, mat in enumerate(vsf.matrix_sequence))
-
-    # Collect the lfs and lfs indices for the accumulate call
-    test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames),
-                                   basis_size,
-                                   order="f"
-                                   )
-
-    # In the jacobian case, also determine the space for the ansatz space
-    if jacobian_inames:
-        # TODO the next line should get its inames from
-        # elsewhere. This is *NOT* robust (but works right now)
-        from dune.perftool.sumfact.basis import _basis_functions_per_direction
-        ansatz_lfs.index = flatten_index(tuple(prim.Variable(jacobian_inames[i])
-                                               for i in range(world_dimension())),
-                                         _basis_functions_per_direction(trial_leaf_element),
-                                         order="f"
-                                         )
-
     # Add a sum factorization kernel that implements the multiplication
     # with the test function (stage 3)
     from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
     result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep)))
 
-    # Determine the expression to accumulate with. This depends on the vectorization strategy!
-    result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
-    vecinames = ()
-
-    if vsf.vectorized:
-        iname = accum_iname(trial_leaf_element, vsf.vector_width, "vec")
-        vecinames = (iname,)
-        transform(lp.tag_inames, [(iname, "vec")])
-        from dune.perftool.tools import maybe_wrap_subscript
-        result = prim.Call(prim.Variable("horizontal_add"),
-                           (maybe_wrap_subscript(result, prim.Variable(iname)),),
-                           )
-
-    if not get_option("fastdg"):
-        rank = 2 if jacobian_inames else 1
-        expr = prim.Call(PDELabAccumulationFunction(accumvar, rank),
-                         (test_lfs.get_args() +
-                          ansatz_lfs.get_args() +
-                          (result,)
-                          )
-                         )
-        instruction(assignees=(),
-                    expression=expr,
-                    forced_iname_deps=frozenset(inames + vecinames + jacobian_inames),
-                    forced_iname_deps_is_final=True,
-                    depends_on=insn_dep,
-                    predicates=predicates
-                    )
+    if not get_form_option("fastdg"):
+        insn_dep = vsf.interface.realize(vsf, result, insn_dep)
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 46563ee1f6805037b13b13136e16f2fcc1d23ff0..39bba49e67fe66a4541ac0ae633e6e403be32c52 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -11,11 +11,13 @@ from dune.perftool.generation import (backend,
                                       get_counted_variable,
                                       get_counter,
                                       get_global_context_value,
+                                      globalarg,
                                       iname,
                                       instruction,
                                       kernel_cached,
                                       temporary_variable,
                                       )
+from dune.perftool.loopy.target import type_floatingpoint
 from dune.perftool.sumfact.tabulation import (basis_functions_per_direction,
                                               construct_basis_matrix_sequence,
                                               BasisTabulationMatrix,
@@ -31,9 +33,8 @@ from dune.perftool.pdelab.argument import name_coefficientcontainer
 from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            )
-from dune.perftool.loopy.buffer import initialize_buffer, get_buffer_temporary
-from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase
-from dune.perftool.options import get_option
+from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase
+from dune.perftool.options import get_form_option
 from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, name_leaf_lfs
@@ -50,7 +51,7 @@ from loopy.match import Writes
 import pymbolic.primitives as prim
 
 
-class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
+class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
     def __init__(self,
                  coeff_func=None,
                  element=None,
@@ -64,10 +65,21 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
                                  restriction=restriction,
                                  )
 
-    def __str__(self):
+    def __repr__(self):
         return "{}_{}".format(self.coeff_func(self.restriction), self.element_index)
 
-    def realize(self, sf, index, insn_dep):
+    def __str__(self):
+        return repr(self)
+
+    @property
+    def stage(self):
+        return 1
+
+    @property
+    def direct_is_possible(self):
+        return get_form_option("fastdg")
+
+    def realize(self, sf, insn_dep, index=0):
         lfs = name_lfs(self.element, self.restriction, self.element_index)
         basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
         container = self.coeff_func(self.restriction)
@@ -75,25 +87,57 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
         coeff = pc(container, lfs, basisiname)
 
         # Get the input temporary!
-        name = get_buffer_temporary(sf.buffer,
-                                    shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width),
-                                    name="input_{}".format(sf.buffer)
-                                    )
+        from dune.perftool.sumfact.realization import name_buffer_storage
+        name = "input_{}".format(sf.buffer)
+        temporary_variable(name,
+                           shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width),
+                           custom_base_storage=name_buffer_storage(sf.buffer, 0),
+                           managed=True,
+                           )
 
         assignee = prim.Subscript(prim.Variable(name),
                                   (prim.Variable(basisiname),) + (index,))
-        instruction(assignee=assignee,
-                    expression=coeff,
-                    depends_on=sf.insn_dep.union(insn_dep),
-                    tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
-                    )
+        insn = instruction(assignee=assignee,
+                           expression=coeff,
+                           depends_on=sf.insn_dep.union(insn_dep),
+                           tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
+                           )
+
+        return insn_dep.union(frozenset({insn}))
+
+    def realize_direct(self, shape, inames, which=0):
+        arg = "fastdg{}".format(which)
+
+        from dune.perftool.sumfact.accumulation import _dof_offset
+        globalarg(arg,
+                  shape=shape,
+                  dim_tags=",".join("f" * len(shape)),
+                  offset=_dof_offset(self.element, self.element_index),
+                  )
+
+        return prim.Subscript(prim.Variable(arg), inames)
+
+    @property
+    def function_name_suffix(self):
+        if get_form_option("fastdg"):
+            return "_fastdg1_{}comp{}".format(FEM_name_mangling(self.element), self.element_index)
+        else:
+            return ""
+
+    @property
+    def function_args(self):
+        if get_form_option("fastdg"):
+            func = self.coeff_func(self.restriction)
+            return ("{}.data()".format(func),)
+        else:
+            return ()
 
     @property
-    def direct_input(self):
-        if get_option("fastdg"):
-            return self.coeff_func(self.restriction)
+    def signature_args(self):
+        if get_form_option("fastdg"):
+            return ("const {}* fastdg0".format(type_floatingpoint()),)
         else:
-            return None
+            return ()
 
 
 def _basis_functions_per_direction(element):
@@ -141,7 +185,7 @@ def pymbolic_coefficient_gradient(element, restriction, index, coeff_func, visit
     # The sum factorization kernel object gathering all relevant information
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
                        position_priority=grad_index,
-                       input=inp,
+                       interface=inp,
                        )
 
     from dune.perftool.sumfact.vectorization import attach_vectorization_info
@@ -182,7 +226,7 @@ def pymbolic_coefficient(element, restriction, index, coeff_func, visitor):
                                 )
 
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
-                       input=inp,
+                       interface=inp,
                        position_priority=3,
                        )
 
diff --git a/python/dune/perftool/sumfact/geometry.py b/python/dune/perftool/sumfact/geometry.py
index e17b7aac660718cb424cae73fe2c7c60452fb5f2..7b78de412d2e2c5a893e2d1d1d7e32315bd6aafd 100644
--- a/python/dune/perftool/sumfact/geometry.py
+++ b/python/dune/perftool/sumfact/geometry.py
@@ -12,15 +12,14 @@ from dune.perftool.generation import (backend,
                                       temporary_variable,
                                       globalarg,
                                       )
-from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            name_geometry,
                                            )
 from dune.perftool.sumfact.switch import get_facedir
-from dune.perftool.sumfact.symbolic import SumfactKernelInputBase
+from dune.perftool.sumfact.symbolic import SumfactKernelInterfaceBase
 from dune.perftool.sumfact.vectorization import attach_vectorization_info
-from dune.perftool.options import get_option, option_switch
+from dune.perftool.options import get_form_option, option_switch
 from dune.perftool.ufl.modified_terminals import Restriction
 
 from pytools import ImmutableRecord
@@ -36,15 +35,18 @@ def corner_iname():
     return name
 
 
-class GeoCornersInput(SumfactKernelInputBase, ImmutableRecord):
+class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord):
     def __init__(self, dir):
         ImmutableRecord.__init__(self, dir=dir)
 
     def realize(self, sf, index, insn_dep):
-        name = get_buffer_temporary(sf.buffer,
-                                    shape=(2 ** local_dimension(), sf.vector_width),
-                                    name="input_{}".format(sf.buffer)
-                                    )
+        from dune.perftool.sumfact.realization import name_buffer_storage
+        name = "input_{}".format(sf.buffer)
+        temporary_variable(name,
+                           shape=(2 ** local_dimension(), sf.vector_width),
+                           custom_base_storage=name_buffer_storage(sf.buffer, 0),
+                           managed=True,
+                           )
 
         ciname = corner_iname()
         geo = name_geometry()
@@ -152,7 +154,7 @@ def pymbolic_spatial_coordinate_axiparallel(do_predicates, visitor):
     restriction = Restriction.NONE
     from dune.perftool.generation import get_global_context_value
     if get_global_context_value("integral_type") == "interior_facet":
-        restriction = Restriction.NEGATIVE
+        restriction = Restriction.POSITIVE
     from dune.perftool.sumfact.switch import get_facedir
     face = get_facedir(restriction)
 
@@ -181,10 +183,10 @@ def pymbolic_spatial_coordinate_axiparallel(do_predicates, visitor):
 def pymbolic_unit_outer_normal(visitor_indices):
     index, = visitor_indices
     assert isinstance(index, int)
-    if get_option("diagonal_transformation_matrix"):
+    if get_form_option("diagonal_transformation_matrix"):
         from dune.perftool.sumfact.switch import get_facedir, get_facemod
-        if index == get_facedir(Restriction.NEGATIVE):
-            if get_facemod(Restriction.NEGATIVE):
+        if index == get_facedir(Restriction.POSITIVE):
+            if get_facemod(Restriction.POSITIVE):
                 return 1, None
             else:
                 return -1, None
@@ -198,10 +200,10 @@ def pymbolic_unit_outer_normal(visitor_indices):
 def pymbolic_unit_inner_normal(visitor_indices):
     index, = visitor_indices
     assert isinstance(index, int)
-    if get_option("diagonal_transformation_matrix"):
+    if get_form_option("diagonal_transformation_matrix"):
         from dune.perftool.sumfact.switch import get_facedir, get_facemod
-        if index == get_facedir(Restriction.NEGATIVE):
-            if get_facemod(Restriction.NEGATIVE):
+        if index == get_facedir(Restriction.POSITIVE):
+            if get_facemod(Restriction.POSITIVE):
                 return -1, None
             else:
                 return 1, None
@@ -213,7 +215,7 @@ def pymbolic_unit_inner_normal(visitor_indices):
 
 
 def pymbolic_facet_jacobian_determinant():
-    if get_option("constant_transformation_matrix"):
+    if get_form_option("constant_transformation_matrix"):
         return pymbolic_constant_facet_jacobian_determinant()
     else:
         from dune.perftool.pdelab.geometry import pymbolic_facet_jacobian_determinant as _norm
@@ -221,7 +223,7 @@ def pymbolic_facet_jacobian_determinant():
 
 
 def pymbolic_constant_facet_jacobian_determinant():
-    facedir = get_facedir(Restriction.NEGATIVE)
+    facedir = get_facedir(Restriction.POSITIVE)
     assert isinstance(facedir, int)
 
     name = "fdetjac"
@@ -256,7 +258,7 @@ def define_constant_facet_jacobian_determinant_eval(name):
 
 
 def pymbolic_facet_area():
-    if get_option("constant_transformation_matrix"):
+    if get_form_option("constant_transformation_matrix"):
         return pymbolic_facet_jacobian_determinant()
     else:
         from dune.perftool.pdelab.geometry import pymbolic_facet_area as _norm
diff --git a/python/dune/perftool/sumfact/quadrature.py b/python/dune/perftool/sumfact/quadrature.py
index 8209a41f2890078aba8ffb25b708937fda2c7d01..2b709293a7f2c556c43a3812654c6e178e7d971f 100644
--- a/python/dune/perftool/sumfact/quadrature.py
+++ b/python/dune/perftool/sumfact/quadrature.py
@@ -18,7 +18,7 @@ from dune.perftool.pdelab.argument import name_accumulation_variable
 from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            )
-from dune.perftool.options import get_option
+from dune.perftool.options import get_form_option
 from dune.perftool.sumfact.switch import get_facedir
 from dune.perftool.loopy.target import dtype_floatingpoint
 
@@ -142,7 +142,7 @@ def recursive_quadrature_weight(visitor, direction=0):
 
 def quadrature_weight(visitor):
     # Return non-precomputed version
-    if not get_option("precompute_quadrature_info"):
+    if not get_form_option("precompute_quadrature_info"):
         return recursive_quadrature_weight(visitor)
 
     # Quadrature points per (local) direction
@@ -195,7 +195,7 @@ def define_quadrature_position(name, index):
 @backend(interface="quad_pos", name="sumfact")
 def pymbolic_quadrature_position(index, visitor):
     # Return the non-precomputed version
-    if not get_option("precompute_quadrature_info"):
+    if not get_form_option("precompute_quadrature_info"):
         name = 'pos'
         temporary_variable(name, shape=(local_dimension(),), shape_impl=("fv",))
         define_quadrature_position(name, index)
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index 703e3e062b72a5615d57d205033a861bb5388ba3..777f8dab972edba7ee55398f7a2d41c37d3c03b4 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -3,34 +3,42 @@ The code that triggers the creation of the necessary code constructs
 to realize a sum factorization kernel
 """
 from dune.perftool.generation import (barrier,
+                                      delete_cache_items,
                                       dump_accumulate_timer,
                                       generator_factory,
                                       get_global_context_value,
                                       globalarg,
                                       instruction,
+                                      kernel_cached,
                                       post_include,
                                       preamble,
                                       silenced_warning,
                                       temporary_variable,
                                       transform,
                                       )
-from dune.perftool.loopy.buffer import (get_buffer_temporary,
-                                        switch_base_storage,
-                                        )
+from dune.perftool.loopy.flatten import flatten_index
 from dune.perftool.pdelab.argument import pymbolic_coefficient
 from dune.perftool.pdelab.basis import shape_as_pymbolic
 from dune.perftool.pdelab.geometry import world_dimension
-from dune.perftool.options import get_option
+from dune.perftool.options import (get_form_option,
+                                   get_option,
+                                   )
 from dune.perftool.pdelab.signatures import assembler_routine_name
 from dune.perftool.sumfact.permutation import (sumfact_permutation_strategy,
                                                permute_backward,
                                                permute_forward,
                                                )
+from dune.perftool.sumfact.quadrature import quadrature_points_per_direction
+from dune.perftool.sumfact.symbolic import (SumfactKernel,
+                                            VectorizedSumfactKernel,
+                                            )
 from dune.perftool.sumfact.vectorization import attach_vectorization_info
 from dune.perftool.sumfact.accumulation import sumfact_iname
 from dune.perftool.loopy.target import dtype_floatingpoint
 from dune.perftool.loopy.vcl import ExplicitVCLCast
+from dune.perftool.tools import get_leaf, remove_duplicates
 
+from pytools import product
 from ufl import MixedElement
 
 import loopy as lp
@@ -38,6 +46,11 @@ import numpy as np
 import pymbolic.primitives as prim
 
 
+# Use a generator function to collect the sum factorization kernel implementations that are needed.
+# This way they can easily be extracted at the end of the form visiting process.
+necessary_kernel_implementations = generator_factory(item_tags=("kernelimpl",), cache_key_generator=lambda a: a[0].function_name, no_deco=True)
+
+
 def realize_sum_factorization_kernel(sf, **kwargs):
     if get_global_context_value("dry_run", False):
         return sf, sf.insn_dep
@@ -45,50 +58,88 @@ def realize_sum_factorization_kernel(sf, **kwargs):
         return _realize_sum_factorization_kernel(sf, **kwargs)
 
 
-@preamble
-def alias_data_array(name, data):
-    return "auto {} = {}.data();".format(name, data)
+def name_buffer_storage(buff, which):
+    name = "{}_{}".format(buff, which)
+    return name
 
 
-@generator_factory(item_tags=("sumfactkernel",),
-                   context_tags=("kernel",),
-                   cache_key_generator=lambda s, **kw: s.cache_key)
+@kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
 
-    # Measure times and count operations in c++ code
-    if get_option("instrumentation_level") >= 4:
-        if sf.stage == 1:
-            setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
-            insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer),
-                                                             within_inames=frozenset(sf.within_inames),
-                                                             depends_on=insn_dep)}))
-
-        timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage)
-        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-        dump_accumulate_timer(timer_name)
-        insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                                         within_inames=frozenset(sf.within_inames),
-                                                         depends_on=insn_dep,
-                                                         ),
-                                             }))
-
-    direct_input = sf.input.direct_input
-
-    # Set up the input for stage 1
-    if direct_input is None:
-        if sf.vectorized:
-            for i, inputsf in enumerate(sf.kernels):
-                inputsf.input.realize(sf, i, inputsf.insn_dep.union(insn_dep))
-        else:
-            sf.input.realize(sf, 0, insn_dep)
+    # Get all the necessary pieces for a function call
+    buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2))
+
+    # Make sure that the storage is allocated and has a certain minimum size
+    # This is necessary to allocate buffers that will be passed to sumfact kernel
+    # functions. Loopy has no knowledge of what happens with those...
+    for buf in buffers:
+        # Determine the necessary size of the buffer. We assume that we do not
+        # underintegrate the form!!!
+        size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
+                   product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
+        temporary_variable("{}_dummy".format(buf),
+                           shape=(size,),
+                           custom_base_storage=buf,
+                           decl_method=lambda n, k, di: None,
+                           )
+
+    # Realize the input if it is not direct
+    if sf.stage == 1 and not sf.interface.direct_is_possible:
+        insn_dep = insn_dep.union(sf.interface.realize(sf, insn_dep))
+
+    # Trigger generation of the sum factorization kernel function
+    qp = quadrature_points_per_direction()
+    necessary_kernel_implementations((sf, qp))
+
+    # Call the function
+    code = "{}({});".format(sf.function_name, ", ".join(buffers + sf.interface.function_args))
+    tag = "sumfact_stage{}".format(sf.stage)
+    insn_dep = frozenset({instruction(code=code,
+                                      depends_on=insn_dep,
+                                      within_inames=frozenset(sf.within_inames),
+                                      tags=frozenset({tag}),
+                                      predicates=sf.predicates,
+                                      )
+                          })
+
+    # Interpret the output as a temporary of correct shape
+    out = "{}_output".format(sf.buffer)
+    temporary_variable(out,
+                       shape=sf.output_shape,
+                       dim_tags=sf.output_dimtags,
+                       custom_base_storage=buffers[sf.length % 2],
+                       managed=True,
+                       )
+    silenced_warning("read_no_write({})".format(out))
 
-        insn_dep = insn_dep.union(frozenset({lp.match.Writes("input_{}".format(sf.buffer))}))
-    else:
-        if sf.input.element_index is None:
-            direct_input_arg = "{}_access".format(direct_input)
-        else:
-            direct_input_arg = "{}_access_comp{}".format(direct_input, sf.input.element_index)
+    return lp.TaggedVariable(out, sf.tag), insn_dep
+
+
+class BufferSwitcher(object):
+    def __init__(self):
+        self.current = 0
+
+    def get_temporary(self, name=None, **kwargs):
+        assert name
+        bs = "buffer{}".format(self.current)
+        globalarg(bs)
+        temporary_variable(name,
+                           managed=True,
+                           custom_base_storage=bs,
+                           **kwargs
+                           )
+
+        return name
+
+    def switch(self):
+        self.current = (self.current + 1) % 2
+
+
+def realize_sumfact_kernel_function(sf):
+    # Get a buffer switcher instance
+    buffer = BufferSwitcher()
+    insn_dep = frozenset()
 
     # Prepare some dim_tags/shapes for later use
     ftags = ",".join(["f"] * sf.length)
@@ -141,24 +192,12 @@ def _realize_sum_factorization_kernel(sf):
         # * a global data structure (if FastDGGridOperator is in use)
         # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
         input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
-        if l == 0 and direct_input is not None:
+        if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             # See comment below
             input_inames = permute_backward(input_inames, perm)
             inp_shape = permute_backward(inp_shape, perm)
 
-            globalarg(direct_input_arg,
-                      shape=inp_shape,
-                      dim_tags=novec_ftags,
-                      offset=_dof_offset(sf.input.element, sf.input.element_index),
-                      )
-            alias_data_array(direct_input_arg, direct_input)
-            if matrix.vectorized:
-                input_summand = prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=sf.vector_width),
-                                          (prim.Subscript(prim.Variable(direct_input_arg),
-                                                          input_inames),))
-            else:
-                input_summand = prim.Subscript(prim.Variable(direct_input_arg),
-                                               input_inames + vec_iname)
+            input_summand = sf.interface.realize_direct(inp_shape, input_inames)
         else:
             # If we did permute the order of a matrices above we also
             # permuted the order of out_inames. Unfortunately the
@@ -171,9 +210,10 @@ def _realize_sum_factorization_kernel(sf):
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
-            inp = get_buffer_temporary(sf.buffer,
+            inp = buffer.get_temporary("buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
-                                       dim_tags=ftags)
+                                       dim_tags=ftags,
+                                       )
 
             # The input temporary will only be read from, so we need to silence the loopy warning
             silenced_warning('read_no_write({})'.format(inp))
@@ -181,7 +221,7 @@ def _realize_sum_factorization_kernel(sf):
             input_summand = prim.Subscript(prim.Variable(inp),
                                            input_inames + vec_iname)
 
-        switch_base_storage(sf.buffer)
+        buffer.switch()
 
         # Get a temporary that interprets the base storage of the output.
         #
@@ -195,9 +235,10 @@ def _realize_sum_factorization_kernel(sf):
         output_shape = tuple(out_shape[1:]) + (out_shape[0],)
         if l == len(matrix_sequence) - 1:
             output_shape = permute_backward(output_shape, perm)
-        out = get_buffer_temporary(sf.buffer,
+        out = buffer.get_temporary("buff_step{}_out".format(l),
                                    shape=output_shape + vec_shape,
-                                   dim_tags=ftags)
+                                   dim_tags=ftags,
+                                   )
 
         # Write the matrix-matrix multiplication expression
         matprod = prim.Product((matrix.pymbolic((prim.Variable(out_inames[0]), k_expr) + vec_iname),
@@ -213,108 +254,28 @@ def _realize_sum_factorization_kernel(sf):
         if l == len(matrix_sequence) - 1:
             output_inames = permute_backward(output_inames, perm)
 
+        # Collect the keyword arguments for the loopy instruction
+        insn_args = {"depends_on": insn_dep}
+
         # In case of direct output we directly accumulate the result
         # of the Sumfactorization into some global data structure.
-        if l == len(matrix_sequence) - 1 and get_option('fastdg') and sf.stage == 3:
-            ft = get_global_context_value("form_type")
-            if sf.test_element_index is None:
-                direct_output = "{}_access".format(sf.accumvar)
-            else:
-                direct_output = "{}_access_comp{}".format(sf.accumvar, sf.test_element_index)
-            if ft == 'residual' or ft == 'jacobian_apply':
-                globalarg(direct_output,
-                          shape=output_shape,
-                          dim_tags=novec_ftags,
-                          offset=_dof_offset(sf.test_element, sf.test_element_index),
-                          )
-                alias_data_array(direct_output, sf.accumvar)
-
-                assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
-            else:
-                assert ft == 'jacobian'
-
-                direct_output = "{}x{}".format(direct_output, sf.trial_element_index)
-                rowsize = sum(tuple(s for s in _local_sizes(sf.trial_element)))
-                element = sf.trial_element
-                if isinstance(element, MixedElement):
-                    element = element.extract_component(sf.trial_element_index)[1]
-                other_shape = tuple(element.degree() + 1 for e in range(sf.length))
-                from pytools import product
-                manual_strides = tuple("stride:{}".format(rowsize * product(output_shape[:i])) for i in range(sf.length))
-                dim_tags = "{},{}".format(novec_ftags, ",".join(manual_strides))
-                globalarg(direct_output,
-                          shape=other_shape + output_shape,
-                          offset=rowsize * _dof_offset(sf.test_element, sf.test_element_index) + _dof_offset(sf.trial_element, sf.trial_element_index),
-                          dim_tags=dim_tags,
-                          )
-                alias_data_array(direct_output, sf.accumvar)
-                # TODO: It is at least questionnable, whether using the *order* of the inames in here
-                #       for indexing is a good idea. Then again, it is hard to find an alternative.
-                _ansatz_inames = tuple(prim.Variable(i) for i in sf.within_inames)
-                assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames)
-
-            # In case of vectorization we need to apply a horizontal add
-            if matrix.vectorized:
-                matprod = prim.Call(prim.Variable("horizontal_add"),
-                                    (matprod,))
-
-            # We need to accumulate
-            matprod = prim.Sum((assignee, matprod))
+        if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3:
+            if sf.vectorized:
+                insn_args["forced_iname_deps"] = frozenset({vec_iname[0].name})
+            insn_dep = sf.interface.realize_direct(matprod, output_inames, out_shape, **insn_args)
         else:
-            assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
-
-        tag = "sumfact_stage{}".format(sf.stage)
-        if sf.stage == 3:
-            tag = "{}_{}".format(tag, "_".join(sf.within_inames))
-
-        # Issue the reduction instruction that implements the multiplication
-        # at the same time store the instruction ID for the next instruction to depend on
-        insn_dep = frozenset({instruction(assignee=assignee,
-                                          expression=matprod,
-                                          forced_iname_deps=frozenset([iname for iname in out_inames]).union(frozenset(sf.within_inames)),
-                                          forced_iname_deps_is_final=True,
-                                          depends_on=insn_dep,
-                                          tags=frozenset({tag}),
-                                          predicates=sf.predicates,
-                                          groups=frozenset({sf.group_name}),
-                                          )
-                              })
-
-    # Measure times and count operations in c++ code
-    if get_option("instrumentation_level") >= 4:
-        stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                           depends_on=frozenset({lp.match.Tagged(tag)}),
-                                           within_inames=frozenset(sf.within_inames))})
-        if sf.stage == 1:
-            qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
-            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-            dump_accumulate_timer(timer_name)
-            frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name),
-                                   depends_on=stop_insn)})
-
-    out = get_buffer_temporary(sf.buffer,
-                               shape=sf.output_shape,
-                               dim_tags=sf.output_dimtags,
-                               )
-    silenced_warning('read_no_write({})'.format(out))
-
-    return lp.TaggedVariable(out, sf.tag), insn_dep
-
-
-def _local_sizes(element):
-    from ufl import FiniteElement, MixedElement
-    if isinstance(element, MixedElement):
-        for subel in element.sub_elements():
-            for s in _local_sizes(subel):
-                yield s
-    else:
-        assert isinstance(element, FiniteElement)
-        yield (element.degree() + 1)**element.cell().geometric_dimension()
-
-
-def _dof_offset(element, component):
-    if component is None:
-        return 0
-    else:
-        sizes = tuple(s for s in _local_sizes(element))
-        return sum(sizes[0:component])
+            # Issue the reduction instruction that implements the multiplication
+            # at the same time store the instruction ID for the next instruction to depend on
+            insn_dep = frozenset({instruction(assignee=prim.Subscript(prim.Variable(out), output_inames + vec_iname),
+                                              expression=matprod,
+                                              **insn_args
+                                              )
+                                  })
+
+    # Construct a loopy kernel object
+    from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
+    args = ("const char* buffer0", "const char* buffer1") + sf.interface.signature_args
+    signature = "void {}({}) const".format(sf.function_name, ", ".join(args))
+    kernel = extract_kernel_from_cache("kernel_default", sf.function_name, [signature], add_timings=False)
+    delete_cache_items("kernel_default")
+    return kernel
diff --git a/python/dune/perftool/sumfact/switch.py b/python/dune/perftool/sumfact/switch.py
index a420850d6bdecb4e479688f7be8c3e1039572351..8c6a0f13ace27b6e3081030607149b7b003aa82c 100644
--- a/python/dune/perftool/sumfact/switch.py
+++ b/python/dune/perftool/sumfact/switch.py
@@ -10,13 +10,13 @@ from dune.perftool.pdelab.signatures import (assembly_routine_args,
                                              assembly_routine_signature,
                                              kernel_name,
                                              )
-from dune.perftool.options import get_option
+from dune.perftool.options import get_form_option
 from dune.perftool.cgen.clazz import ClassMember
 
 
 @backend(interface="generate_kernels_per_integral", name="sumfact")
 def generate_kernels_per_integral(integrals):
-    dim = get_global_context_value("formdata").geometric_dimension
+    dim = world_dimension()
     measure = get_global_context_value("integral_type")
 
     if measure == "cell":
@@ -53,7 +53,7 @@ def get_kernel_name(facedir_s=None, facemod_s=None, facedir_n=None, facemod_n=No
 
 def decide_if_kernel_is_necessary(facedir_s, facemod_s, facedir_n, facemod_n):
     # If we are not using YaspGrid, all variants need to be realized
-    if not get_option("diagonal_transformation_matrix"):
+    if not get_form_option("diagonal_transformation_matrix"):
         return True
 
     # The PDELab machineries visit-once policy combined with Yasp avoids any visits
@@ -138,9 +138,9 @@ def generate_interior_facet_switch():
 
 def get_facedir(restriction):
     from dune.perftool.pdelab.restriction import Restriction
-    if restriction == Restriction.NEGATIVE or get_global_context_value("integral_type") == "exterior_facet":
+    if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet":
         return get_global_context_value("facedir_s")
-    if restriction == Restriction.POSITIVE:
+    if restriction == Restriction.NEGATIVE:
         return get_global_context_value("facedir_n")
     if restriction == Restriction.NONE:
         return None
@@ -149,9 +149,9 @@ def get_facedir(restriction):
 
 def get_facemod(restriction):
     from dune.perftool.pdelab.restriction import Restriction
-    if restriction == Restriction.NEGATIVE or get_global_context_value("integral_type") == "exterior_facet":
+    if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet":
         return get_global_context_value("facemod_s")
-    if restriction == Restriction.POSITIVE:
+    if restriction == Restriction.NEGATIVE:
         return get_global_context_value("facemod_n")
     if restriction == Restriction.NONE:
         return None
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index babb432bc324f6ce08cec9f931df7ba8191181fa..fb283a0536318c1585a0963a33ca76105571cb84 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -1,10 +1,16 @@
 """ A pymbolic node representing a sum factorization kernel """
 
-from dune.perftool.options import get_option
-from dune.perftool.generation import get_counted_variable
+from dune.perftool.options import get_form_option, get_option
+from dune.perftool.generation import (get_counted_variable,
+                                      subst_rule,
+                                      transform,
+                                      )
 from dune.perftool.pdelab.geometry import local_dimension, world_dimension
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray
+from dune.perftool.loopy.target import dtype_floatingpoint, type_floatingpoint
+from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad
+from dune.perftool.tools import get_leaf, maybe_wrap_subscript, remove_duplicates
 
 from pytools import ImmutableRecord, product
 
@@ -16,13 +22,194 @@ import frozendict
 import inspect
 
 
-class SumfactKernelInputBase(object):
+class SumfactKernelInterfaceBase(object):
+    """ A base class for the input/output of a sum factorization kernel
+    In stage 1, this represents the input object, in stage 3 the output object.
+    """
+    def realize(self, *a, **kw):
+        raise NotImplementedError
+
+    def realize_direct(self, *a, **kw):
+        raise NotImplementedError
+
+    @property
+    def within_inames(self):
+        return ()
+
+    @property
+    def direct_is_possible(self):
+        return False
+
+    @property
+    def stage(self):
+        raise NotImplementedError
+
+    @property
+    def function_args(self):
+        return ()
+
+    @property
+    def signature_args(self):
+        return ()
+
+    @property
+    def function_name_suffix(self):
+        return ""
+
+    def __repr__(self):
+        return "SumfactKernelInterfaceBase()"
+
+
+class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
+    def __init__(self, interfaces):
+        assert(isinstance(interfaces, tuple))
+        self.interfaces = interfaces
+
+    def __repr__(self):
+        return "_".join(repr(i) for i in self.interfaces)
+
+    @property
+    def stage(self):
+        return 1
+
+    @property
+    def direct_is_possible(self):
+        return all(i.direct_is_possible for i in self.interfaces)
+
+    def realize(self, sf, dep):
+        for i, inp in enumerate(self.interfaces):
+            dep = dep.union(inp.realize(sf, dep, index=i))
+        return dep
+
+    def realize_direct(self, shape, inames):
+        # Check whether the input exhibits a favorable structure
+        # (whether we can broadcast scalar values into SIMD registers)
+        total = set(self.interfaces)
+        lower = set(self.interfaces[:len(self.interfaces) // 2])
+        upper = set(self.interfaces[len(self.interfaces) // 2:])
+
+        if len(total) == 1:
+            # All interfaces refer to the exact same input coefficient.
+            # We implement this by broadcasting it into a SIMD register
+            return prim.Call(ExplicitVCLCast(dtype_floatingpoint()),
+                             (self.interfaces[0].realize_direct(shape, inames),)
+                             )
+        elif len(total) == 2 and len(lower) == 1 and len(upper) == 1:
+            # The lower and the upper half of the SIMD register each use
+            # a single input coefficient, so we combine the SIMD register
+            # from two shorter SIMD types
+            return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()),
+                             (self.interfaces[0].realize_direct(shape, inames),
+                              self.interfaces[len(self.interfaces) // 2].realize_direct(shape, inames, which=1),
+                              )
+                             )
+        else:
+            # The input does not exhibit a broadcastable structure, we
+            # need to load scalars into the SIMD vector.
+            raise NotImplementedError("SIMD loads from scalars not implemented!")
+
+    @property
+    def function_args(self):
+        return sum((i.function_args for i in remove_duplicates(self.interfaces)), ())
+
+    @property
+    def signature_args(self):
+        if get_form_option("fastdg"):
+            return tuple("const {}* fastdg{}".format(type_floatingpoint(), i) for i, _ in enumerate(remove_duplicates(self.interfaces)))
+        else:
+            return ()
+
+    @property
+    def function_name_suffix(self):
+        return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
+
+
+class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
+    def __init__(self, interfaces):
+        self.interfaces = interfaces
+
+    def __repr__(self):
+        return "_".join(repr(o) for o in self.interfaces)
+
     @property
-    def direct_input(self):
-        return None
+    def stage(self):
+        return 3
+
+    @property
+    def within_inames(self):
+        return self.interfaces[0].within_inames
+
+    def _add_hadd(self, o, result):
+        hadd_function = "horizontal_add"
+        if len(set(self.interfaces)) > 1:
+            pos = self.interfaces.index(o)
+            if pos == 0:
+                hadd_function = "horizontal_add_lower"
+            else:
+                hadd_function = "horizontal_add_upper"
+
+        return prim.Call(prim.Variable(hadd_function), (result,))
+
+    def realize(self, sf, result, insn_dep):
+        outputs = set(self.interfaces)
 
-    def realize(self, sf, i, dep):
-        pass
+        trial_element, = set(o.trial_element for o in self.interfaces)
+        trial_element_index, = set(o.trial_element_index for o in self.interfaces)
+        from dune.perftool.sumfact.accumulation import accum_iname
+        element = get_leaf(trial_element, trial_element_index) if trial_element is not None else None
+        inames = tuple(accum_iname(element, mat.rows, i)
+                       for i, mat in enumerate(sf.matrix_sequence))
+        veciname = accum_iname(element, sf.vector_width // len(outputs), "vec")
+        transform(lp.tag_inames, [(veciname, "vec")])
+
+        deps = frozenset()
+        for o in outputs:
+            hadd_result = self._add_hadd(o, maybe_wrap_subscript(result, tuple(prim.Variable(iname) for iname in inames + (veciname,))))
+            deps = deps.union(o.realize(sf, hadd_result, insn_dep, inames=inames, additional_inames=(veciname,)))
+
+        return deps
+
+    def realize_direct(self, result, inames, shape, **args):
+        outputs = set(self.interfaces)
+
+        # If multiple horizontal_add's are to be performed with 'result'
+        # we need to precompute the result!
+        if len(outputs) > 1:
+            substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
+            subst_rule(substname, (), result)
+            result = prim.Call(prim.Variable(substname), ())
+            transform(lp.precompute, substname)
+
+        deps = frozenset()
+        for o in outputs:
+            hadd_result = self._add_hadd(o, result)
+            which = tuple(remove_duplicates(self.interfaces)).index(o)
+            deps = deps.union(o.realize_direct(hadd_result, inames, shape, which=which, **args))
+
+        return deps
+
+    @property
+    def function_args(self):
+        if get_form_option("fastdg"):
+            return sum((i.function_args for i in remove_duplicates(self.interfaces)), ())
+        else:
+            return()
+
+    @property
+    def signature_args(self):
+        if get_form_option("fastdg"):
+            def _get_pair(i):
+                ret = ("{}* fastdg{}".format(type_floatingpoint(), i),)
+                if self.within_inames:
+                    ret = ret + ("unsigned int jacobian_offset{}".format(i),)
+                return ret
+            return sum((_get_pair(i) for i, _ in enumerate(remove_duplicates(self.interfaces))), ())
+        else:
+            return ()
+
+    @property
+    def function_name_suffix(self):
+        return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
 
 
 class SumfactKernelBase(object):
@@ -33,16 +220,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def __init__(self,
                  matrix_sequence=None,
                  buffer=None,
-                 stage=1,
                  position_priority=None,
-                 restriction=None,
                  insn_dep=frozenset(),
-                 input=None,
-                 accumvar=None,
-                 test_element=None,
-                 test_element_index=None,
-                 trial_element=None,
-                 trial_element_index=None,
+                 interface=SumfactKernelInterfaceBase(),
                  predicates=frozenset(),
                  ):
         """Create a sum factorization kernel
@@ -96,34 +276,18 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             for intermediate results. The memory is expected to be
             pre-initialized with the input or you have to provide
             direct_input (FastDGGridOperator).
-        stage: 1 or 3
         position_priority: Will be used in the dry run to order kernels
             when doing vectorization e.g. (dx u,dy u,dz u, u).
-        restriction: Restriction for faces values.
         insn_dep: An instruction ID that the first issued instruction
             should depend upon. All following ones will depend on each
             other.
-        input: An SumfactKernelInputBase instance describing the input of the kernel
-        accumvar: The accumulation variable to accumulate into
-        trial_element: The leaf element of the trial function space.
-            Used to correctly nest stage 3 in the jacobian case.
-        test_element: The leaf element of the test function space
-            Used to compute offsets in the fastdg case.
-        test_element_index: the component of the test_element
-        trial_element_index: the component of the trial_element
+        interface: A SumfactKernelInterfaceBase instance describing the input
+            (stage 1) or output (stage 3) of the kernel
         """
         # Assert the inputs!
         assert isinstance(matrix_sequence, tuple)
         assert all(isinstance(m, BasisTabulationMatrixBase) for m in matrix_sequence)
-
-        assert stage in (1, 3)
-
-        if stage == 1:
-            assert isinstance(input, SumfactKernelInputBase)
-
-        if stage == 3:
-            assert isinstance(restriction, tuple)
-
+        assert isinstance(interface, SumfactKernelInterfaceBase)
         assert isinstance(insn_dep, frozenset)
 
         # The following construction is a bit weird: Dict comprehensions do not have
@@ -149,7 +313,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def __str__(self):
         # Above stringifier just calls back into this
         return "SF{}:[{}]->[{}]".format(self.stage,
-                                        str(self.input),
+                                        str(self.interface),
                                         ", ".join(str(m) for m in self.matrix_sequence))
 
     mapper_method = "map_sumfact_kernel"
@@ -158,32 +322,56 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     # Some cache key definitions
     # Watch out for the documentation to see which key is used unter what circumstances
     #
+    @property
+    def function_name(self):
+        """ The name of the function that implements this kernel """
+        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence),
+                                    self.interface.function_name_suffix)
+
+    @property
+    def parallel_key(self):
+        """ A key that identifies parallelizable kernels. """
+        return tuple(m.basis_size for m in self.matrix_sequence) + (self.stage, self.buffer)
 
     @property
     def cache_key(self):
         """ The cache key that can be used in generation magic
         Any two sum factorization kernels having the same cache_key
-        are realized simulatenously!
+        are realized simultaneously!
         """
-        return (self.matrix_sequence, self.restriction, self.stage, self.buffer, self.test_element_index)
+        if self.buffer is None:
+            # During dry run, we return something unique to this kernel
+            return repr(self)
+        else:
+            # Later we identify kernels implemented in parallel by the assigned buffer
+            return self.buffer
 
     @property
-    def input_key(self):
+    def inout_key(self):
         """ A cache key for the input coefficients
         Any two sum factorization kernels having the same input_key
-        work on the same input coefficient (and are suitable for simultaneous
-        treatment because of that)
+        work on the same input coefficient (stage 1) or accumulate
+        into the same thing (stage 3)
         """
-        return (self.input, self.restriction, self.accumvar, self.trial_element_index)
-
-    @property
-    def group_name(self):
-        return "sfgroup_{}_{}_{}_{}".format(self.input, self.restriction, self.accumvar, self.trial_element_index)
+        return repr(self.interface)
 
     #
     # Some convenience methods to extract information about the sum factorization kernel
     #
 
+    def __lt__(self, other):
+        if self.parallel_key != other.parallel_key:
+            return self.parallel_key < other.parallel_key
+        if self.inout_key != other.inout_key:
+            return self.inout_key < other.inout_key
+        if self.position_priority == other.position_priority:
+            return repr(self) < repr(other)
+        if self.position_priority is None:
+            return False
+        if other.position_priority is None:
+            return True
+        return self.position_priority < other.position_priority
+
     @property
     def length(self):
         """ The number of matrices to apply """
@@ -199,14 +387,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
 
     @property
     def within_inames(self):
-        if self.trial_element is None:
-            return ()
-        else:
-            from dune.perftool.sumfact.basis import lfs_inames
-            element = self.trial_element
-            if isinstance(element, MixedElement):
-                element = element.extract_component(self.trial_element_index)[1]
-            return lfs_inames(element, self.restriction)
+        return self.interface.within_inames
 
     def vec_index(self, sf):
         """ Map an unvectorized sumfact kernel object to its position
@@ -292,6 +473,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def tag(self):
         return "sumfac"
 
+    @property
+    def stage(self):
+        return self.interface.stage
+
     #
     # Define properties for conformity with the interface of VectorizedSumfactKernel
     #
@@ -358,7 +543,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         # Assert all the properties that need to be the same across all subkernels
         assert len(set(k.stage for k in kernels)) == 1
         assert len(set(k.length for k in kernels)) == 1
-        assert len(set(k.restriction for k in kernels)) == 1
         assert len(set(k.within_inames for k in kernels)) == 1
         assert len(set(k.predicates for k in kernels)) == 1
 
@@ -366,7 +550,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         for i in range(kernels[0].length):
             assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1
             assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1
-            assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1
+            assert len(set(tuple(k.matrix_sequence[i].direction for k in kernels))) == 1
             assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1
 
         # Join the instruction dependencies of all subkernels
@@ -394,7 +578,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def __str__(self):
         # Above stringifier just calls back into this
         return "VSF{}:[{}]->[{}]".format(self.stage,
-                                         ", ".join(str(k.input) for k in self.kernels),
+                                         ", ".join(str(k.interface) for k in self.kernels),
                                          ", ".join(str(mat) for mat in self.matrix_sequence))
 
     mapper_method = "map_vectorized_sumfact_kernel"
@@ -405,6 +589,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     # Some cache key definitions
     # Watch out for the documentation to see which key is used unter what circumstances
     #
+    @property
+    def function_name(self):
+        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence),
+                                    self.interface.function_name_suffix)
 
     @property
     def cache_key(self):
@@ -437,36 +625,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def within_inames(self):
         return self.kernels[0].within_inames
 
-    @property
-    def test_element(self):
-        return self.kernels[0].test_element
-
-    @property
-    def test_element_index(self):
-        return self.kernels[0].test_element_index
-
-    @property
-    def trial_element(self):
-        return self.kernels[0].trial_element
-
-    @property
-    def trial_element_index(self):
-        return self.kernels[0].trial_element_index
-
     @property
     def predicates(self):
         return self.kernels[0].predicates
 
-    @property
-    def input(self):
-        assert len(set(k.input for k in self.kernels)) == 1
-        return self.kernels[0].input
-
-    @property
-    def accumvar(self):
-        assert len(set(k.accumvar for k in self.kernels)) == 1
-        return self.kernels[0].accumvar
-
     @property
     def transposed(self):
         return self.kernels[0].transposed
@@ -487,16 +649,23 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     #
 
     @property
-    def cache_key(self):
-        return (tuple(k.cache_key for k in self.kernels), self.buffer)
+    def stage(self):
+        return self.kernels[0].stage
+
+    @property
+    def interface(self):
+        if self.stage == 1:
+            return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels))
+        else:
+            return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels))
 
     @property
-    def input_key(self):
-        return tuple(k.input_key for k in self.kernels)
+    def cache_key(self):
+        return (tuple(k.cache_key for k in self.kernels), self.buffer)
 
     @property
-    def group_name(self):
-        return "_".join(k.group_name for k in self.kernels)
+    def inout_key(self):
+        return tuple(k.inout_key for k in self.kernels)
 
     @property
     def length(self):
@@ -507,10 +676,11 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         return True
 
     def horizontal_index(self, sf):
-        key = tuple(mat.derivative for mat in sf.matrix_sequence)
         for i, k in enumerate(self.kernels):
-            if tuple(mat.derivative for mat in k.matrix_sequence) == key:
-                return i
+            if sf.inout_key == k.inout_key:
+                if tuple(mat.derivative for mat in sf.matrix_sequence) == tuple(mat.derivative for mat in k.matrix_sequence):
+                    return i
+
         return 0
 
     def _quadrature_index(self, sf, visitor):
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index cbe8fbbc374c6ada8e3c63a1a81adb5a3c742ac6..99107c9cd8f6a4429965985fb8fbbdcbd3e898e9 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -1,7 +1,5 @@
 from dune.perftool.ufl.modified_terminals import Restriction
 
-from dune.perftool.options import get_option
-
 from dune.perftool.pdelab.argument import name_coefficientcontainer
 from dune.perftool.pdelab.geometry import world_dimension, local_dimension
 from dune.perftool.generation import (class_member,
@@ -20,9 +18,8 @@ from dune.perftool.generation import (class_member,
                                       transform,
                                       valuearg
                                       )
-from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.loopy.target import dtype_floatingpoint
-from dune.perftool.loopy.vcl import ExplicitVCLCast
+from dune.perftool.loopy.vcl import ExplicitVCLCast, get_vcl_type_size
 from dune.perftool.pdelab.localoperator import (name_domain_field,
                                                 lop_template_range_field,
                                                 )
@@ -64,14 +61,27 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
                                  slice_index=slice_index,
                                  )
 
+    @property
+    def _shortname(self):
+        infos = ["d{}".format(self.basis_size),
+                 "q{}".format(self.quadrature_size)]
+
+        if self.transpose:
+            infos.append("T")
+
+        if self.derivative:
+            infos.append("dx")
+
+        if self.face is not None:
+            infos.append("f{}".format(self.face))
+
+        if self.slice_size is not None:
+            infos.append("s{}".format(self.slice_index))
+
+        return "".join(infos)
+
     def __str__(self):
-        return "{}{}A{}{}{}" \
-               .format("face{}_".format(self.face) if self.face is not None else "",
-                       "d" if self.derivative else "",
-                       self.basis_size,
-                       "T" if self.transpose else "",
-                       "_slice{}".format(self.slice_index) if self.slice_size is not None else "",
-                       )
+        return "Theta_{}".format(self._shortname)
 
     @property
     def rows(self):
@@ -98,14 +108,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         return size
 
     def pymbolic(self, indices):
-        name = "{}{}Theta{}{}_qp{}_dof{}" \
-               .format("face{}_".format(self.face) if self.face is not None else "",
-                       "d" if self.derivative else "",
-                       "T" if self.transpose else "",
-                       "_slice{}".format(self.slice_index) if self.slice_size is not None else "",
-                       self.quadrature_size,
-                       self.basis_size,
-                       )
+        name = str(self)
         define_theta(name, self)
         return prim.Subscript(prim.Variable(name), indices)
 
@@ -133,7 +136,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         assert len(set(t.quadrature_size for t in tabs)) == 1
         assert len(set(t.basis_size for t in tabs)) == 1
         assert len(set(t.transpose for t in tabs)) == 1
-        assert len(set(t.face for t in tabs)) == 1
+        assert len(set(t.direction for t in tabs)) == 1
         assert len(set(t.slice_size for t in tabs)) == 1
         self.tabs = tabs
 
@@ -142,11 +145,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         self.width = width
 
     def __str__(self):
-        abbrevs = tuple("{}A{}{}".format("d" if t.derivative else "",
-                                         self.basis_size,
-                                         "s{}".format(t.slice_index) if t.slice_size is not None else "")
-                        for t in self.tabs)
-        return "_".join(abbrevs)
+        return "Theta{}".format("_".join((t._shortname for t in self.tabs)))
 
     @property
     def quadrature_size(self):
@@ -196,17 +195,10 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         # Check whether we can realize this by broadcasting the values of a simple tabulation
         if len(set(self.tabs)) == 1:
             theta = self.tabs[0].pymbolic(indices[:-1])
-            return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=len(self.tabs)), (theta,))
-
-        abbrevs = tuple("{}x{}".format("d" if t.derivative else "",
-                                       "s{}".format(t.slice_index) if t.slice_size is not None else "")
-                        for t in self.tabs)
-        name = "ThetaLarge{}{}_{}_qp{}_dof{}".format("face{}_".format(self.face) if self.face is not None else "",
-                                                     "T" if self.transpose else "",
-                                                     "_".join(abbrevs),
-                                                     self.tabs[0].quadrature_size,
-                                                     self.tabs[0].basis_size,
-                                                     )
+            return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=get_vcl_type_size(dtype_floatingpoint())), (theta,))
+
+        name = str(self)
+
         for i, tab in enumerate(self.tabs):
             define_theta(name, tab, additional_indices=(i,), width=self.width)
 
@@ -288,7 +280,10 @@ def local_quadrature_points_per_direction():
 
 
 def polynomial_degree():
-    form = get_global_context_value("formdata").preprocessed_form
+    data = get_global_context_value("data")
+    form = data.object_by_name[get_form_option("form")]
+    from dune.perftool.ufl.preprocess import preprocess_form
+    form = preprocess_form(form).preprocessed_form
     degree = form.coefficients()[0].ufl_element().degree()
     if isinstance(degree, int):
         degree = (degree,) * world_dimension()
@@ -407,9 +402,7 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
     bound = tabmat.quadrature_size
     if tabmat.slice_size is not None:
         bound *= tabmat.slice_size
-    qp = name_oned_quadrature_points(bound)
-    qw = name_oned_quadrature_weights(bound)
-    sort_quadrature_points_weights(qp, qw, bound)
+
     degree = tabmat.basis_size - 1
     polynomials = name_polynomials(degree)
 
@@ -438,9 +431,15 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
     if tabmat.slice_size is not None:
         inames[0] = tabmat.slice_size * inames[0] + tabmat.slice_index
 
-    args = [inames[1], prim.Subscript(prim.Variable(qp), (inames[0],))]
-    if tabmat.face is not None:
-        args[1] = tabmat.face
+    args = [inames[1]]
+
+    if tabmat.face is None:
+        qp = name_oned_quadrature_points(bound)
+        qw = name_oned_quadrature_weights(bound)
+        sort_quadrature_points_weights(qp, qw, bound)
+        args.append(prim.Subscript(prim.Variable(qp), (inames[0],)))
+    else:
+        args.append(tabmat.face)
 
     instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices),
                 expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)),
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index f3fb96b99b0d7cb0a633e6d499f6e362db5da207..4f7abe67048b3016db2277153844757116996164 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -18,9 +18,9 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                               quadrature_points_per_direction,
                                               set_quadrature_points,
                                               )
-from dune.perftool.error import PerftoolError
-from dune.perftool.options import get_option
-from dune.perftool.tools import add_to_frozendict, round_to_multiple
+from dune.perftool.error import PerftoolVectorizationError
+from dune.perftool.options import get_form_option
+from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff
 
 from pytools import product
 from frozendict import frozendict
@@ -33,7 +33,7 @@ import math
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
 def _cache_vectorization_info(old, new):
     if new is None:
-        raise PerftoolError("Vectorization info for sum factorization kernel was not gathered correctly!")
+        raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!")
     return new
 
 
@@ -48,13 +48,6 @@ def attach_vectorization_info(sf):
         return _cache_vectorization_info(sf, None)
 
 
-def position_penalty_factor(sf):
-    if isinstance(sf, SumfactKernel) or sf.vertical_width > 1:
-        return 1
-    else:
-        return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length))
-
-
 @backend(interface="vectorization_strategy", name="model")
 def costmodel(sf):
     # Penalize vertical vectorization
@@ -66,17 +59,17 @@ def costmodel(sf):
         scalar_penalty = get_vcl_type_size(dtype_floatingpoint())
 
     # Return total operations
-    return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
+    return sf.operations * vertical_penalty * scalar_penalty
 
 
 @backend(interface="vectorization_strategy", name="explicit")
 def explicit_costfunction(sf):
     # Read the explicitly set values for horizontal and vertical vectorization
     width = get_vcl_type_size(dtype_floatingpoint())
-    horizontal = get_option("vectorization_horizontal")
+    horizontal = get_form_option("vectorization_horizontal")
     if horizontal is None:
         horizontal = width
-    vertical = get_option("vectorization_vertical")
+    vertical = get_form_option("vectorization_vertical")
     if vertical is None:
         vertical = 1
     horizontal = int(horizontal)
@@ -84,15 +77,17 @@ def explicit_costfunction(sf):
 
     if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
         # Penalize position mapping
-        return position_penalty_factor(sf)
+        return sf.operations
     else:
         return 1000000000000
 
 
-def strategy_cost(strategy):
+def strategy_cost(strat_tuple):
+    qp, strategy = strat_tuple
     func = get_backend(interface="vectorization_strategy",
-                       selector=lambda: get_option("vectorization_strategy"))
+                       selector=lambda: get_form_option("vectorization_strategy"))
     keys = set(sf.cache_key for sf in strategy.values())
+    set_quadrature_points(qp)
 
     # Sum over all the sum factorization kernels in the realization
     score = 0.0
@@ -104,6 +99,13 @@ def strategy_cost(strategy):
     return score
 
 
+def fixedqp_strategy_costfunction(qp):
+    """ Build a cost function that takes a strategy alone by fixing the
+    quadrature point tuple qp """
+    # strategy_cost takes a (qp, strategy) tuple, so qp is bound in the closure
+    return lambda strategy: strategy_cost((qp, strategy))
+
+
 def stringify_vectorization_strategy(strategy):
     result = []
     qp, strategy = strategy
@@ -139,7 +141,7 @@ def decide_vectorization_strategy():
     from dune.perftool.generation import retrieve_cache_items
     all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
 
-    # Stage 1 sumfactorizations that were actually used
+    # Stage 1 sum factorizations that were actually used
     basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')]
 
     # This means we can have sum factorizations that will not get used
@@ -149,7 +151,7 @@ def decide_vectorization_strategy():
     active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
 
     # If no vectorization is needed, abort now
-    if get_option("vectorization_strategy") == "none":
+    if get_form_option("vectorization_strategy") == "none":
         for sf in all_sumfacts:
             _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
         return
@@ -157,29 +159,22 @@ def decide_vectorization_strategy():
     logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
                  .format(len(active_sumfacts)))
 
-    # Find the best vectorization strategy by using a costmodel
-    width = get_vcl_type_size(dtype_floatingpoint())
-
     #
-    # Optimize over all the possible quadrature point tuples
+    # Find the best vectorization strategy by using a costmodel
     #
-    quad_points = [quadrature_points_per_direction()]
-
-    if get_option("vectorization_allow_quadrature_changes"):
-        sf = next(iter(active_sumfacts))
-        depth = 1
-        while depth <= width:
-            i = 0 if sf.matrix_sequence[0].face is None else 1
-            quad = list(quadrature_points_per_direction())
-            quad[i] = round_to_multiple(quad[i], depth)
-            quad_points.append(tuple(quad))
-            depth = depth * 2
-        quad_points = list(set(quad_points))
+    # Note that this optimization procedure uses a hierarchic approach to bypass
+    # the problems of unfavorable complexity of the set of all possible vectorization
+    # opportunities. Optimizations are performed at different levels (you find these
+    # levels in the function names implementing them), where optimal solutions at a
+    # higher level are combined into lower level solutions or optima of optimal solutions
+    # at higher level are calculated:
+    # * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima)
+    # * Level 2: Split by parallelizability and combine optima into optimal solution
+    # * Level 3: Optimize number of different inputs to consider
+    # * Level 4: Optimize horizontal/vertical/hybrid strategy
+    width = get_vcl_type_size(dtype_floatingpoint())
+    qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width)
 
-    # Find the minimum cost strategy between all the quadrature point tuples
-    optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points}
-    qp = min(optimal_strategies, key=lambda qp: strategy_cost(optimal_strategies[qp]))
-    sfdict = optimal_strategies[qp]
     set_quadrature_points(qp)
 
     logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
@@ -193,85 +188,104 @@ def decide_vectorization_strategy():
         _cache_vectorization_info(sf, sfdict[sf])
 
 
-def fixed_quadrature_optimal_vectorization(sumfacts, width, qp):
-    """ For a given quadrature point tuple, find the optimal strategy!
+def level1_optimal_vectorization_strategy(sumfacts, width):
+    """ Level 1: return the pair (qp, strategy) of minimal cost among the candidate quadrature point tuples """
+    quad_points = [quadrature_points_per_direction()]  # the unmodified tuple is always a candidate
+    if get_form_option("vectorization_allow_quadrature_changes"):
+        sf = next(iter(sumfacts))
+        depth = 1
+        while depth <= width:  # depth takes the values 1, 2, 4, ... up to width
+            i = 0 if sf.matrix_sequence[0].face is None else 1
+            quad = list(quadrature_points_per_direction())
+            quad[i] = round_to_multiple(quad[i], depth)
+            quad_points.append(tuple(quad))
+            depth = depth * 2
+        quad_points = list(set(quad_points))  # drop duplicate tuples
+
+    # Find the minimum cost strategy between all the quadrature point tuples
+    optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points}
+    qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
+
+    return qp, optimal_strategies[qp]
 
-    In order to have this scale sufficiently, we cannot simply list all vectorization
-    opportunities and score them individually, but we need to do a divide and conquer
-    approach.
-    """
-    set_quadrature_points(qp)
 
-    # Find the sets of simultaneously realizable kernels (thats an equivalence relation)
-    keys = frozenset(sf.input_key for sf in sumfacts)
+def level2_optimal_vectorization_strategy(sumfacts, width, qp):
+    # Find the sets of simultaneously realizable kernels
+    keys = frozenset(sf.parallel_key for sf in sumfacts)
 
     # Find minimums for each of these sets
     sfdict = frozendict()
+
     for key in keys:
-        key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
-        minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
-                      key=strategy_cost)
-        sfdict = add_to_frozendict(sfdict, minimum)
+        key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key)
+        key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp),
+                           key=fixedqp_strategy_costfunction(qp))
+        sfdict = add_to_frozendict(sfdict, key_strategy)
 
     return sfdict
 
 
-def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
+def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
+    for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
+        # Add non-vectorized implementation information to all kernels that are not present in
+        # the optimal strategy
+        yield add_to_frozendict(opp,
+                                {sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp})
+
+
+def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()):
     if len(sumfacts) == 0:
-        # We have gone into recursion deep enough to have all sum factorization nodes
-        # assigned their vectorized counterpart. We can yield the result now!
         yield already
         return
 
-    # Otherwise we pick a random sum factorization kernel and construct all the vectorization
-    # opportunities realizing this particular kernel and go into recursion.
-    sf_to_decide = next(iter(sumfacts))
-
-    # Have "unvectorized" as an option, although it is not good
-    for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
-                                                              width,
-                                                              qp,
-                                                              add_to_frozendict(already,
-                                                                                {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
-                                                                                ),
-                                                              ):
-        yield opp
-
-    horizontal = 1
-    while horizontal <= width:
-        # Iterate over the possible combinations of sum factorization kernels
-        # taking into account all the permutations of kernels. This also includes
-        # combinations which use a padding of 1 - but only for pure horizontality.
-        generators = [it.permutations(sumfacts, horizontal)]
-        if horizontal >= 4:
-            generators.append(it.permutations(sumfacts, horizontal - 1))
-        for combo in it.chain(*generators):
-            # The chosen kernels must be part of the kernels for recursion
-            # to work correctly
-            if sf_to_decide not in combo:
-                continue
+    # We store the information whether a vectorization opportunity has been yielded from this
+    # generator to yield an incomplete strategy if not (which is then completed with unvectorized
+    # kernel implementations)
+    yielded = False
+
+    # Find the number of input coefficients we can work on
+    keys = frozenset(sf.inout_key for sf in sumfacts)
+
+    inoutkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.inout_key == key, sumfacts))) for key in sorted(keys)]
+
+    for parallel in (1, 2):
+        if parallel > len(keys):
+            continue
+
+        horizontal = 1
+        while horizontal <= width // parallel:
+            combo = sum((inoutkey_sumfacts[part][:horizontal] for part in range(parallel)), ())
+
+            vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
+            horizontal *= 2
 
-            # Set up the vectorization dict for this combo
-            vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
             if vecdict is None:
                 # This particular choice was rejected for some reason.
                 # Possible reasons:
                 # * the quadrature point tuple not being suitable
                 #   for this vectorization strategy
+                # * there are not enough horizontal kernels
                 continue
 
             # Go into recursion to also vectorize all kernels not in this combo
-            for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference(combo),
-                                                                      width,
-                                                                      qp,
-                                                                      add_to_frozendict(already, vecdict),
-                                                                      ):
+            for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
+                                                                        width,
+                                                                        qp,
+                                                                        add_to_frozendict(already, vecdict),
+                                                                        ):
+                yielded = True
                 yield opp
 
-        horizontal = horizontal * 2
+    # If we did not yield on this recursion level, yield what we got so far
+    if not yielded:
+        yield already
 
 
 def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
+    # Discard opportunities that do not contain enough horizontal kernels
+    if len(sumfacts) not in (horizontal, horizontal - 1):
+        return None
+
     # Enhance the list of sumfact nodes by adding vertical splittings
     kernels = []
     for sf in sumfacts:
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index 8f41f3357374f4d9791edc898dd4a0274a0141fe..d5c0a18ebe8f7e32316aa459c959ab1eb9ba75f8 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -76,3 +76,31 @@ def add_to_frozendict(fd, valdict):
     t = dict(fd)
     t.update(valdict)
     return frozendict.frozendict(t)
+
+
+def list_diff(l1, l2):
+    """ Return the items of l1 that are not contained in l2 as a new list
+
+    Order and duplicates of l1 are kept. Membership is tested with 'in' on l2.
+    """
+    return [item for item in l1 if item not in l2]
+
+
+def get_leaf(element, index):
+    """ Resolve component 'index' of a MixedElement to its leaf element;
+    any other element is returned unchanged """
+    from ufl import MixedElement
+    if not isinstance(element, MixedElement):
+        return element
+    assert isinstance(index, int)
+    # The leaf element is the second entry of what extract_component returns
+    return element.extract_component(index)[1]
+
+
+def remove_duplicates(iterable):
+    """ Lazily yield every item the first time it occurs, keeping the input order """
+    known = set()
+    for item in iterable:
+        if item not in known:
+            known.add(item)
+            yield item
diff --git a/python/dune/perftool/ufl/modified_terminals.py b/python/dune/perftool/ufl/modified_terminals.py
index ac372d7fde8ba533fc52232f663a292026c2b7b4..bf3a6df939ddf787967b16c316cd2aaab308ce07 100644
--- a/python/dune/perftool/ufl/modified_terminals.py
+++ b/python/dune/perftool/ufl/modified_terminals.py
@@ -9,8 +9,8 @@ import ufl.classes as uc
 
 class Restriction:
     NONE = 0
-    NEGATIVE = 1
-    POSITIVE = 2
+    POSITIVE = 1
+    NEGATIVE = 2
 
 
 class ModifiedArgument(Record):
diff --git a/python/dune/perftool/ufl/preprocess.py b/python/dune/perftool/ufl/preprocess.py
index 4564fee060d339c93a1f362ccdc7523d258e6856..19ca10359de05dec5154dc194d18a9233645664b 100644
--- a/python/dune/perftool/ufl/preprocess.py
+++ b/python/dune/perftool/ufl/preprocess.py
@@ -1,8 +1,27 @@
 """ Preprocessing algorithms for UFL forms """
 
 import ufl.classes as uc
+import ufl.algorithms.apply_function_pullbacks as afp
 
+from pytools import memoize
 
+
+class FunctionPullbackApplier(afp.FunctionPullbackApplier):
+    """ Apply pullbacks to arguments and to the coefficients with count 0 or 1 only """
+    def argument(self, o):
+        return afp.apply_single_function_pullbacks(o)
+
+    def coefficient(self, o):
+        if o.count() in (0, 1):
+            return afp.apply_single_function_pullbacks(o)
+        return o
+
+
+# Monkey patch the pullback applier from UFL
+afp.FunctionPullbackApplier = FunctionPullbackApplier
+
+
+@memoize
 def preprocess_form(form):
     from ufl.algorithms import compute_form_data
     formdata = compute_form_data(form,
diff --git a/python/dune/perftool/ufl/transformations/__init__.py b/python/dune/perftool/ufl/transformations/__init__.py
index dde8a965177b657a0ed9554302fa3e95c8bf7825..de66173a2a04bf3d5f5b7873d8444d230e665ca0 100644
--- a/python/dune/perftool/ufl/transformations/__init__.py
+++ b/python/dune/perftool/ufl/transformations/__init__.py
@@ -19,10 +19,10 @@ class UFLTransformationWrapper(object):
             return
 
         # Write out a dot file
-        from dune.perftool.options import get_option
-        if get_option("print_transformations"):
+        from dune.perftool.options import get_form_option
+        if get_form_option("print_transformations"):
             import os
-            dir = get_option("print_transformations_dir")
+            dir = get_form_option("print_transformations_dir")
 
             for i, exprtowrite in enumerate(expr):
                 filename = "trafo_{}_{}_{}{}.dot".format(self.name, str(self.counter).zfill(4), "in" if before else "out", "_{}".format(i) if len(expr) > 1 else "")
diff --git a/python/dune/perftool/ufl/transformations/blockpreconditioner.py b/python/dune/perftool/ufl/transformations/blockpreconditioner.py
new file mode 100644
index 0000000000000000000000000000000000000000..c16c11520ccef27a5cf23d22ba387846b434b4a0
--- /dev/null
+++ b/python/dune/perftool/ufl/transformations/blockpreconditioner.py
@@ -0,0 +1,82 @@
+""" Derive block preconditioners from residual forms """
+
+from dune.perftool.ufl.modified_terminals import Restriction
+
+from ufl.algorithms import MultiFunction
+from ufl.algorithms.map_integrands import map_integrands
+
+import ufl.classes as uc
+import itertools
+
+
+class OffDiagonalBlockSwitcher(MultiFunction):
+    """ Replace all arguments whose restriction differs from the requested one by Zero
+
+    restrictions is indexed with the argument number and holds the Restriction
+    that the respective argument needs to carry in order to be kept. Restricted
+    and reference value nodes collapse to Zero if their operand did.
+    """
+    def __init__(self, restrictions):
+        self.restrictions = restrictions
+        # Restriction of the restricted node being visited (NONE outside of one)
+        self.res = Restriction.NONE
+        MultiFunction.__init__(self)
+
+    def expr(self, o):
+        return self.reuse_if_untouched(o, *tuple(self(op) for op in o.ufl_operands))
+
+    def positive_restricted(self, o):
+        self.res = Restriction.POSITIVE
+        ret = self(o.ufl_operands[0])
+        # Reset the tracked restriction, otherwise it leaks into unrestricted terms
+        self.res = Restriction.NONE
+        return ret if isinstance(ret, uc.Zero) else o
+
+    def negative_restricted(self, o):
+        self.res = Restriction.NEGATIVE
+        ret = self(o.ufl_operands[0])
+        # Reset the tracked restriction, otherwise it leaks into unrestricted terms
+        self.res = Restriction.NONE
+        return ret if isinstance(ret, uc.Zero) else o
+
+    def reference_value(self, o):
+        ret = self(o.ufl_operands[0])
+        return ret if isinstance(ret, uc.Zero) else o
+
+    def argument(self, o):
+        # Keep the argument only if it is visited under the requested restriction
+        if self.res == self.restrictions[o.number()]:
+            return o
+        return uc.Zero(shape=o.ufl_shape,
+                       free_indices=o.ufl_free_indices,
+                       index_dimensions=o.ufl_index_dimensions)
+
+
+def list_restriction_tuples(diagonal):
+    """ Yield the pairs of restrictions belonging to the diagonal or offdiagonal blocks """
+    if diagonal:
+        yield (Restriction.NONE, Restriction.NONE)
+
+    sides = (Restriction.POSITIVE, Restriction.NEGATIVE)
+    # Diagonal blocks pair equal restrictions, offdiagonal blocks differing ones
+    for first, second in itertools.product(sides, sides):
+        if (first == second) == bool(diagonal):
+            yield (first, second)
+
+
+def _block_jacobian(form, diagonal=True):
+    """ Sum up the form filtered by each restriction tuple of the requested block type """
+    assert len(form.arguments()) == 2
+
+    # One filtered copy of the form per restriction tuple
+    blocks = [map_integrands(OffDiagonalBlockSwitcher(rtup), form)
+              for rtup in list_restriction_tuples(diagonal)]
+    return sum(blocks)
+
+
+def diagonal_block_jacobian(form):
+    return _block_jacobian(form, diagonal=True)  # keep the diagonal blocks only
+
+
+def offdiagonal_block_jacobian(form):
+    return _block_jacobian(form, diagonal=False)  # keep the offdiagonal blocks only
diff --git a/python/dune/perftool/ufl/transformations/indexpushdown.py b/python/dune/perftool/ufl/transformations/indexpushdown.py
index 1dd7139d880d947cb8da33a26630cc008ce514f8..73b8d73a670f66517060ac61338a683da275d90e 100644
--- a/python/dune/perftool/ufl/transformations/indexpushdown.py
+++ b/python/dune/perftool/ufl/transformations/indexpushdown.py
@@ -16,9 +16,9 @@ class IndexPushDown(MultiFunction):
             terms = [uc.Indexed(self(term), idx) for term in get_operands(expr)]
             return construct_binary_operator(terms, uc.Sum)
         elif isinstance(expr, uc.Conditional):
-            return uc.Conditional(expr.ufl_operands[0],
-                                  uc.Indexed(self(expr.ufl_operands[1]), idx),
-                                  uc.Indexed(self(expr.ufl_operands[2]), idx)
+            return uc.Conditional(self(expr.ufl_operands[0]),
+                                  self(uc.Indexed(expr.ufl_operands[1], idx)),
+                                  self(uc.Indexed(expr.ufl_operands[2], idx))
                                   )
         else:
             # This is a normal indexed, we treat it as any other.
diff --git a/python/dune/perftool/ufl/visitor.py b/python/dune/perftool/ufl/visitor.py
index fcf2ee7728e96a57ebd94ada15966c992ff8120f..cbede6c30fa7d6754b58b57b662ba126a1dec6b5 100644
--- a/python/dune/perftool/ufl/visitor.py
+++ b/python/dune/perftool/ufl/visitor.py
@@ -13,8 +13,7 @@ from dune.perftool.ufl.modified_terminals import (ModifiedTerminalTracker,
                                                   Restriction,
                                                   )
 from dune.perftool.tools import maybe_wrap_subscript
-from dune.perftool.options import get_option
-from dune.perftool.pdelab.parameter import name_paramclass, name_time
+from dune.perftool.options import get_form_option
 from loopy import Reduction
 
 from pymbolic.primitives import (Call,
@@ -61,7 +60,7 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             self.current_info = info
             expr = self._call(o, False)
             if expr != 0:
-                if get_option("simplify"):
+                if get_form_option("simplify"):
                     from dune.perftool.sympy import simplify_pymbolic_expression
                     expr = simplify_pymbolic_expression(expr)
                 self.interface.generate_accumulation_instruction(expr, self)
@@ -105,7 +104,7 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         # Correct the restriction on boundary integrals
         restriction = self.restriction
         if self.measure == 'exterior_facet':
-            restriction = Restriction.NEGATIVE
+            restriction = Restriction.POSITIVE
         leaf_element = o.ufl_element()
 
         # Select the correct leaf element in the case of this being a mixed finite element
@@ -128,13 +127,13 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             return self.interface.pymbolic_basis(leaf_element, restriction, o.number())
 
     def coefficient(self, o):
+        # Correct the restriction on boundary integrals
+        restriction = self.restriction
+        if self.measure == 'exterior_facet':
+            restriction = Restriction.POSITIVE
+
         # Do something different for trial function and coefficients from jacobian apply
         if o.count() == 0 or o.count() == 1:
-            # Correct the restriction on boundary integrals
-            restriction = self.restriction
-            if self.measure == 'exterior_facet':
-                restriction = Restriction.NEGATIVE
-
             self.interface.initialize_function_spaces(o, self)
 
             index = None
@@ -161,31 +160,19 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
 
         # In this case it represents the time variable
         elif o.count() == 2:
-            param = name_paramclass()
-            time = name_time()
-            name = param + "." + time
-            valuearg(name)
-            return Variable(name)
-
-        # Check if this is a parameter function
+            # The base class 'InstationaryLocalOperatorDefaultMethods' stores the time
+            # and exports it through a getter method 'getTime'
+            return prim.Call(prim.Variable("getTime"), ())
         else:
-            raise NotImplementedError("Handling non-symbolic parameter functions is currently reevaluated!")
-            # We expect all coefficients to be of type Expression!
-            assert isinstance(o, Expression)
-
-            # Determine the name of the parameter function
-            name = get_global_context_value("data").object_names[id(o)]
-
-            cellwise_constant = is_cellwise_constant(o)
+            if self.reference_grad:
+                raise PerftoolUFLError("Coefficient gradients should not be transformed to reference element")
 
-            # Trigger the generation of code for this thing in the parameter class
-            if o.on_intersection:
-                self.interface.intersection_parameter_function(name, o, cellwise_constant)
-            else:
-                self.interface.cell_parameter_function(name, o, self.restriction, cellwise_constant)
+            return self.interface.pymbolic_gridfunction(o, restriction, self.grad)
 
-            # And return a symbol
-            return Variable(name)
+    def variable(self, o):
+        # Only scalar variables are supported. Compare by value: 'is ()' tests object identity
+        assert o.ufl_shape == ()
+        return o.expression().value()
 
     #
     # Handlers for all indexing related stuff
@@ -248,6 +235,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         if all(isinstance(i, int) for i in self.indices):
             index = self.indices[0]
             self.indices = self.indices[1:]
+            if len(self.indices) == 0:
+                self.indices = None
             return self.call(o.ufl_operands[index])
         else:
             return self.interface.pymbolic_list_tensor(o)
@@ -354,6 +343,22 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     def min_value(self, o):
         return self._minmax_impl(min, "min", tuple(self.call(op) for op in o.ufl_operands))
 
+    def math_function(self, o):
+        # MathFunction is a base class for unary functions. We use this to provide
+        # custom functions. Such a custom function inherits from it and defines the
+        # following methods:
+        # * visit: This function is called from here to delegate the visiting process
+        #          to the user code. The only argument is this visitor instance.
+        # * derivative: It is called from UFL AD code to determine the derivative.
+        #               Upstream documentation indicates that FEniCS allows the same
+        #               (ab)use of the MathFunction node.
+        # Note that if the __init__ method of your function differs from MathFunction,
+        # you also need to implement the method _ufl_expr_reconstruct_
+        if hasattr(o, "visit"):
+            return o.visit(self)
+        else:
+            raise NotImplementedError("Function {} is not known to dune-perftool.".format(o._name))
+
     #
     # Handler for conditionals, use pymbolic base implementation
     #
@@ -365,7 +370,7 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         try:
             evaluated = eval(str(cond))
         except:
-            return prim.If(self.call(o.ufl_operands[0]),
+            return prim.If(cond,
                            self.call(o.ufl_operands[1]),
                            self.call(o.ufl_operands[2]))
 
@@ -425,6 +430,11 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         # The normal must be restricted to be well-defined
         assert self.restriction is not Restriction.NONE
 
+        # Note: In UFL the jump is defined as: jump(v) = v('+') -
+        # v('-'). The corresponding outer unit normal is
+        # n=FacetNormal(cell)('+'). In order to be consistent with UFL
+        # we need to create the outer unit normal if the restriction
+        # is positive.
         if self.restriction == Restriction.POSITIVE:
             return self.interface.pymbolic_unit_outer_normal()
         if self.restriction == Restriction.NEGATIVE:
@@ -443,14 +453,14 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     def jacobian_inverse(self, o):
         restriction = self.restriction
         if self.measure == 'exterior_facet':
-            restriction = Restriction.NEGATIVE
+            restriction = Restriction.POSITIVE
 
         assert(len(self.indices) == 2)
         i, j = self.indices
         self.indices = None
 
         # Implement diagonal jacobians for unrolled matrices!
-        if get_option("diagonal_transformation_matrix"):
+        if get_form_option("diagonal_transformation_matrix"):
             if isinstance(i, int) and isinstance(j, int) and i != j:
                 return 0
 
@@ -465,7 +475,7 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     def cell_volume(self, o):
         restriction = self.restriction
         if self.measure == 'exterior_facet':
-            restriction = Restriction.NEGATIVE
+            restriction = Restriction.POSITIVE
 
         return self.interface.pymbolic_cell_volume(restriction)
 
diff --git a/python/loopy b/python/loopy
index e4a05746af70ed6e6b7e5b91984f7303fe96f1f4..dedb956bd72a204a685e7aeb7788d1fa55969899 160000
--- a/python/loopy
+++ b/python/loopy
@@ -1 +1 @@
-Subproject commit e4a05746af70ed6e6b7e5b91984f7303fe96f1f4
+Subproject commit dedb956bd72a204a685e7aeb7788d1fa55969899
diff --git a/python/pymbolic b/python/pymbolic
index 915ecb96c1eb60b82973e8cf695e4ffcb622c90a..ffecfaebf21dc8799cd5d007a969e659b255a1e3 160000
--- a/python/pymbolic
+++ b/python/pymbolic
@@ -1 +1 @@
-Subproject commit 915ecb96c1eb60b82973e8cf695e4ffcb622c90a
+Subproject commit ffecfaebf21dc8799cd5d007a969e659b255a1e3
diff --git a/python/pytools b/python/pytools
index e4dd13899c9161ce641c29c55973bfce3df52972..747a1c1fac3fb4f2067f00c1a670f5a7b963b396 160000
--- a/python/pytools
+++ b/python/pytools
@@ -1 +1 @@
-Subproject commit e4dd13899c9161ce641c29c55973bfce3df52972
+Subproject commit 747a1c1fac3fb4f2067f00c1a670f5a7b963b396
diff --git a/python/setup.py b/python/setup.py
index ccbe9e56658fa87a0c574767f6c7f4282612a750..f193748a5e814895f23fee033c74c850aaf39573 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -44,6 +44,7 @@ setup(name='dune.perftool',
       cmdclass={'test': PyTest},
       entry_points = {
         "console_scripts": [
-            "ufl2pdelab = dune.perftool.compile:compile_form",
+            "generate_operators = dune.perftool.compile:entry_generate_operators",
+            "generate_driver = dune.perftool.compile:entry_generate_driver",
         ]
     })
diff --git a/python/ufl b/python/ufl
index 962d56f65821fb9c50ca4a5a858882c472243431..5a9593c956fc843eee6ce3a2ae2b9cbc4aec62bf 160000
--- a/python/ufl
+++ b/python/ufl
@@ -1 +1 @@
-Subproject commit 962d56f65821fb9c50ca4a5a858882c472243431
+Subproject commit 5a9593c956fc843eee6ce3a2ae2b9cbc4aec62bf
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d0e1df820ce14b12a2bada056ae67e8bce81c318..4be79371bc51e40c4334f0e91343c522d00d2147 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,10 +1,11 @@
 add_subdirectory(hyperbolic)
 add_subdirectory(heatequation)
-add_subdirectory(laplace)
+add_subdirectory(navier-stokes)
 add_subdirectory(nonlinear)
 add_subdirectory(poisson)
 add_subdirectory(stokes)
 
 add_subdirectory(sumfact)
-
-add_subdirectory(blockstructured)
\ No newline at end of file
+add_subdirectory(coeffeval)
+add_subdirectory(blockstructured)
+add_subdirectory(adjoint)
diff --git a/test/adjoint/CMakeLists.txt b/test/adjoint/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..056264ebf8792544fc98b2fd4106a1493dfac54a
--- /dev/null
+++ b/test/adjoint/CMakeLists.txt
@@ -0,0 +1,5 @@
+dune_add_formcompiler_system_test(UFLFILE poisson_mc.ufl
+  BASENAME adjoint_poisson_mc
+  INIFILE poisson_mc.ini
+  SOURCE poisson_mc_main.cc
+  )
diff --git a/test/adjoint/poisson_mc.ini b/test/adjoint/poisson_mc.ini
new file mode 100644
index 0000000000000000000000000000000000000000..fe36a516f8cdd2a57a16e756a5cbfd770e3a5e79
--- /dev/null
+++ b/test/adjoint/poisson_mc.ini
@@ -0,0 +1,34 @@
+__name = adjoint_poisson_mc
+
+lowerleft = 0.0 0.0
+upperright = 1.0 1.0
+elements = 32 32
+elementType = simplical
+
+[wrapper.vtkcompare]
+name = poisson_mc
+extension = vtu
+
+[formcompiler]
+operators = r, r_adjoint, r_control
+
+[formcompiler.r]
+form = r
+filename = poisson_mc_operator_r.hh
+classname = ROperator
+
+[formcompiler.r_adjoint]
+form = r
+adjoint = 1
+objective_function = J
+filename = poisson_mc_operator_r_adjoint.hh
+classname = RAdjointOperator
+
+[formcompiler.r_control]
+form = r
+control = 1
+objective_function = J
+control_variable = A, b, f
+generate_jacobians = 0
+filename = poisson_mc_operator_r_control.hh
+classname = RControlOperator
diff --git a/test/adjoint/poisson_mc.ufl b/test/adjoint/poisson_mc.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..80fb03d0ef548788fd7417d8cd9402f5839ef98a
--- /dev/null
+++ b/test/adjoint/poisson_mc.ufl
@@ -0,0 +1,20 @@
+cell = triangle
+
+x = SpatialCoordinate(cell)
+
+V = FiniteElement("CG", cell, 1)
+u = TrialFunction(V)
+v = TestFunction(V)
+
+g = x[0]*x[0] + x[1]*x[1]
+A = as_matrix([[variable(1.0), variable(0.5)],[variable(1.2), variable(3.0)]])
+b = as_vector([variable(2.0), variable(4.2)])
+c = as_vector([-2.8, 1.7])
+f = variable(-4.0)
+
+r = (inner(A*grad(u), grad(v)) + inner(c,b)*u*v - f*v)*dx
+forms = [r]
+
+J = inner(u,u)*dx
+interpolate_expression = g
+is_dirichlet = 1
\ No newline at end of file
diff --git a/test/adjoint/poisson_mc_driver.hh b/test/adjoint/poisson_mc_driver.hh
new file mode 100644
index 0000000000000000000000000000000000000000..d75f8b056ae11c39f299d243f04de10f866b00ea
--- /dev/null
+++ b/test/adjoint/poisson_mc_driver.hh
@@ -0,0 +1,229 @@
+#ifndef POISSON_MC_DRIVER_HH
+#define POISSON_MC_DRIVER_HH
+
+
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/common/parametertreeparser.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include <random>
+#include "dune/pdelab/function/callableadapter.hh"
+#include "dune/alugrid/grid.hh"
+#include "string"
+#include "dune/perftool/vtkpredicate.hh"
+#include "dune/pdelab/gridfunctionspace/gridfunctionadapter.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/pdelab/gridoperator/gridoperator.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "dune/pdelab/common/functionutilities.hh"
+#include "dune/pdelab/finiteelementmap/pkfem.hh"
+#include "dune/pdelab/constraints/conforming.hh"
+#include "dune/pdelab/function/discretegridviewfunction.hh"
+
+#include "poisson_mc_operator_r.hh"
+#include "poisson_mc_operator_r_adjoint.hh"
+#include "poisson_mc_operator_r_control.hh"
+
+
+bool driver(int argc, char** argv){
+  // Initialize basic stuff...
+  using RangeType = double;
+  Dune::ParameterTree initree;
+  Dune::ParameterTreeParser::readINITree(argv[1], initree);
+
+  // Setup grid (view)...
+  using Grid = Dune::ALUGrid<2, 2, Dune::simplex, Dune::conforming>;
+  using GV = Grid::LeafGridView;
+  using DF = Grid::ctype;
+  IniGridFactory<Grid> factory(initree);
+  std::shared_ptr<Grid> grid = factory.getGrid();
+  GV gv = grid->leafGridView();
+
+  // Set up finite element maps...
+  using P1_FEM = Dune::PDELab::PkLocalFiniteElementMap<GV, DF, RangeType, 1>;
+  P1_FEM p1_fem(gv);
+
+  // Set up grid function spaces...
+  using VectorBackendP1 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::none>;
+  using DirichletConstraintsAssember = Dune::PDELab::ConformingDirichletConstraints;
+  using P1_dirichlet_GFS = Dune::PDELab::GridFunctionSpace<GV, P1_FEM, DirichletConstraintsAssember, VectorBackendP1>;
+  P1_dirichlet_GFS p1_dirichlet_gfs_(gv, p1_fem);
+  p1_dirichlet_gfs_.name("p1_dirichlet_gfs_");
+
+  // Set up constraints container...
+  using P1_dirichlet_GFS_CC = P1_dirichlet_GFS::ConstraintsContainer<RangeType>::Type;
+  P1_dirichlet_GFS_CC p1_dirichlet_gfs__cc;
+  p1_dirichlet_gfs__cc.clear();
+  auto p1_bctype_lambda = [&](const auto& x){ return 1.0; };
+  auto p1_bctype = Dune::PDELab::makeBoundaryConditionFromCallable(gv, p1_bctype_lambda);
+  Dune::PDELab::constraints(p1_bctype, p1_dirichlet_gfs_, p1_dirichlet_gfs__cc);
+
+  // Set up grid operators...
+  using LOP_R = ROperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType>;
+  using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+  using GO_r = Dune::PDELab::GridOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, P1_dirichlet_GFS_CC, P1_dirichlet_GFS_CC>;
+  LOP_R lop_r(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree);
+  p1_dirichlet_gfs_.update();
+  int generic_dof_estimate =  6 * p1_dirichlet_gfs_.maxLocalSize();
+  int dofestimate = initree.get<int>("istl.number_of_nnz", generic_dof_estimate);
+  MatrixBackend mb(dofestimate);
+  GO_r go_r(p1_dirichlet_gfs_, p1_dirichlet_gfs__cc, p1_dirichlet_gfs_, p1_dirichlet_gfs__cc, lop_r, mb);
+  std::cout << "gfs with " << p1_dirichlet_gfs_.size() << " dofs generated  "<< std::endl;
+  std::cout << "cc with " << p1_dirichlet_gfs__cc.size() << " dofs generated  "<< std::endl;
+
+  // Set up solution vectors...
+  using V_R = Dune::PDELab::Backend::Vector<P1_dirichlet_GFS,DF>;
+  V_R x_r(p1_dirichlet_gfs_);
+  x_r = 0.0;
+  auto lambda_0000 = [&](const auto& x){ return (double)x[1] * x[1] + x[0] * x[0]; };
+  auto func_0000 = Dune::PDELab::makeGridFunctionFromCallable(gv, lambda_0000);
+  Dune::PDELab::interpolate(func_0000, p1_dirichlet_gfs_, x_r);
+  auto lambda_0001 = [&](const auto& x){ return 0.0; };
+  auto func_0001 = Dune::PDELab::makeGridFunctionFromCallable(gv, lambda_0001);
+
+  // Set up (non)linear solvers...
+  using LinearSolver = Dune::PDELab::ISTLBackend_SEQ_SuperLU;
+  using SLP = Dune::PDELab::StationaryLinearProblemSolver<GO_r, LinearSolver, V_R>;
+  LinearSolver ls(false);
+  double reduction = initree.get<double>("reduction", 1e-12);
+  SLP slp(go_r, ls, x_r, reduction);
+  slp.apply();
+
+  // Do visualization...
+  using VTKWriter = Dune::SubsamplingVTKWriter<GV>;
+  Dune::RefinementIntervals subint(initree.get<int>("vtk.subsamplinglevel", 1));
+  VTKWriter vtkwriter(gv, subint);
+  std::string vtkfile = initree.get<std::string>("wrapper.vtkcompare.name", "output");
+  CuttingPredicate predicate;
+  Dune::PDELab::addSolutionToVTKWriter(vtkwriter, p1_dirichlet_gfs_, x_r, Dune::PDELab::vtk::defaultNameScheme(), predicate);
+  vtkwriter.write(vtkfile, Dune::VTK::ascii);
+
+
+  //===============================================================//
+  //    ___      _ _       _       _     _____ _          __  __   //
+  //   / _ \    | (_)     (_)     | |   /  ___| |        / _|/ _|  //
+  //  / /_\ \ __| |_  ___  _ _ __ | |_  \ `--.| |_ _   _| |_| |_   //
+  //  |  _  |/ _` | |/ _ \| | '_ \| __|  `--. \ __| | | |  _|  _|  //
+  //  | | | | (_| | | (_) | | | | | |_  /\__/ / |_| |_| | | | |    //
+  //  \_| |_/\__,_| |\___/|_|_| |_|\__| \____/ \__|\__,_|_| |_|    //
+  //             _/ |                                              //
+  //            |__/                                               //
+  //===============================================================//
+
+  std::cout << std::endl << "Adjoint Stuff" << std::endl << std::endl;
+
+  //=========//
+  // Adjoint //
+  //=========//
+
+  // The adjoint needs the solution of the forward problem as DiscreteGridViewFunction
+  using GF_X = Dune::PDELab::DiscreteGridViewFunction<P1_dirichlet_GFS, V_R>;
+  GF_X x_gf(p1_dirichlet_gfs_, x_r);
+
+  // Local operator for adjoint problem
+  using LOP_Adjoint = RAdjointOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF_X>;
+  LOP_Adjoint lop_adjoint(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, x_gf);
+
+  // Grid operator for adjoint problem
+  using GO_Adjoint = Dune::PDELab::GridOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, LOP_Adjoint, MatrixBackend, DF, RangeType, RangeType, P1_dirichlet_GFS_CC, P1_dirichlet_GFS_CC>;
+  GO_Adjoint go_adjoint(p1_dirichlet_gfs_, p1_dirichlet_gfs__cc, p1_dirichlet_gfs_, p1_dirichlet_gfs__cc, lop_adjoint, mb);
+
+  // Boundary condition
+  using V_Adjoint = GO_Adjoint::Traits::Domain;
+  V_Adjoint x_adjoint(p1_dirichlet_gfs_);
+  x_adjoint = 0.0;
+
+  // Solve problem
+  using SLP_Adjoint = Dune::PDELab::StationaryLinearProblemSolver<GO_Adjoint, LinearSolver, V_Adjoint>;
+  SLP_Adjoint slp_adjoint(go_adjoint, ls, x_adjoint, reduction);
+  slp_adjoint.apply();
+
+  // print_l2_norm(p1_dirichlet_gfs_, x_adjoint, gv);
+  using Dune::PDELab::Backend::native;
+  std::cout << "Norm of adjoint vector: " << native(x_adjoint).two_norm() << std::endl;
+
+  //=========//
+  // Control //
+  //=========//
+
+  // The control problem needs the solution of the adjoint problem as DiscreteGridViewFunction
+  using GF_Adjoint = Dune::PDELab::DiscreteGridViewFunction<P1_dirichlet_GFS, V_Adjoint>;
+  GF_Adjoint gf_adjoint(p1_dirichlet_gfs_, x_adjoint);
+
+  // Derivative of objective function w.r.t. the control
+  using DJDM = std::vector<RangeType>;
+  DJDM dJdm(7,0.0);
+
+  // Local operator for control problem
+  using LOP_Control = RControlOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF_Adjoint, DJDM>;
+  LOP_Control lop_control(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, gf_adjoint, dJdm);
+
+  // Grid operator for control problem
+  //
+  // Note: Create without constraints container. We don't want to
+  // apply any Dirichlet constraints here (this would mean setting the
+  // corresponding values of the residual vector to zero).
+  //
+  // Note: Having a GFS that was constructed with dirichlet
+  // constraints and then creating a GO without constraints works.
+  using GO_Control = Dune::PDELab::GridOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, LOP_Control, MatrixBackend, DF, RangeType, RangeType>;
+  GO_Control go_control(p1_dirichlet_gfs_, p1_dirichlet_gfs_, lop_control, mb);
+
+  // Calculate dJdm
+  using V_Control = GO_Control::Traits::Domain;
+  V_Control r_control(p1_dirichlet_gfs_);
+  r_control = 0.0;
+  go_control.residual(x_r, r_control);
+
+  //========================================//
+  // Print derivative of objective function //
+  //========================================//
+
+  std::cout << std::endl;
+  std::cout << "Derivatives of objective function: " << std::setprecision(20)
+            << dJdm[0] << "  "
+            << dJdm[1] << "  "
+            << dJdm[2] << "  "
+            << dJdm[3] << "  "
+            << dJdm[4] << "  "
+            << dJdm[5] << "  "
+            << dJdm[6] << "  "
+            << std::endl;
+  std::cout << std::endl;
+
+  //==================================================================================//
+  //   _____          _    ___      _ _       _       _     _____ _          __  __   //
+  //  |  ___|        | |  / _ \    | (_)     (_)     | |   /  ___| |        / _|/ _|  //
+  //  | |__ _ __   __| | / /_\ \ __| |_  ___  _ _ __ | |_  \ `--.| |_ _   _| |_| |_   //
+  //  |  __| '_ \ / _` | |  _  |/ _` | |/ _ \| | '_ \| __|  `--. \ __| | | |  _|  _|  //
+  //  | |__| | | | (_| | | | | | (_| | | (_) | | | | | |_  /\__/ / |_| |_| | | | |    //
+  //  \____/_| |_|\__,_| \_| |_/\__,_| |\___/|_|_| |_|\__| \____/ \__|\__,_|_| |_|    //
+  //                                _/ |                                              //
+  //                               |__/                                               //
+  //==================================================================================//
+
+  // Compare with results from dolfin-adjoint:
+  using std::abs;
+  bool fail = false;
+  if (abs(dJdm[0]- 0.02895684)>1e-3)
+    fail = true;
+  if (abs(dJdm[1]- 0.00173435)>1e-3)
+    fail = true;
+  if (abs(dJdm[2]- 0.00173435)>1e-3)
+    fail = true;
+  if (abs(dJdm[3]- 0.03019001)>1e-3)
+    fail = true;
+  if (abs(dJdm[4]- 0.05060596)>1e-3)
+    fail = true;
+  if (abs(dJdm[5]- -0.03072505)>1e-3)
+    fail = true;
+  if (abs(dJdm[6]- 0.0236605)>1e-3)
+    fail = true;
+
+  return fail;
+
+}
+
+
+#endif // POISSON_MC_DRIVER_HH
diff --git a/cmake/modules/StandardMain.cmake b/test/adjoint/poisson_mc_main.cc
similarity index 96%
rename from cmake/modules/StandardMain.cmake
rename to test/adjoint/poisson_mc_main.cc
index 028c2efc0b208705845326fc9a621697c72ec408..00ff73821febe039cb4508b4d578426c83332ce9 100644
--- a/cmake/modules/StandardMain.cmake
+++ b/test/adjoint/poisson_mc_main.cc
@@ -5,7 +5,7 @@
 #include <dune/common/parallel/mpihelper.hh>
 #include <dune/common/exceptions.hh>
 
-#include"@GEN_DRIVER@"
+#include"poisson_mc_driver.hh"
 
 int main(int argc, char** argv)
 {
diff --git a/test/blockstructured/nonlinear/nonlinear.mini b/test/blockstructured/nonlinear/nonlinear.mini
index 18b3e9adc9328b7073d87c851582847eea4d2fef..5e9835a0311a98e815e7c54e51a02714b3bef9ee 100644
--- a/test/blockstructured/nonlinear/nonlinear.mini
+++ b/test/blockstructured/nonlinear/nonlinear.mini
@@ -10,5 +10,7 @@ extension = vtu
 
 [formcompiler]
 compare_l2errorsquared = 6e-4
+
+[formcompiler.r]
 blockstructured = 1
-number_of_blocks = 5
\ No newline at end of file
+number_of_blocks = 5
diff --git a/test/blockstructured/nonlinear/nonlinear.ufl b/test/blockstructured/nonlinear/nonlinear.ufl
index 6187bbeb86d8390f578d337727089c254ff4ff06..d43cc0205dfeab043b3ca843b83fab4af435699f 100644
--- a/test/blockstructured/nonlinear/nonlinear.ufl
+++ b/test/blockstructured/nonlinear/nonlinear.ufl
@@ -9,7 +9,7 @@ V = FiniteElement("CG", cell, 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) + u*u*v - f*v)*dx]
-dirichlet_expression = g
+r = (inner(grad(u), grad(v)) + u*u*v - f*v)*dx
+interpolate_expression = g
 exact_solution = g
 is_dirichlet = 1
diff --git a/test/blockstructured/poisson/3d/poisson.mini b/test/blockstructured/poisson/3d/poisson.mini
index 441fbe43ec06fa9095af4f28e310d53c2777bdea..e9e34187a447198666d8de5a11e77e54aaa4ebb1 100644
--- a/test/blockstructured/poisson/3d/poisson.mini
+++ b/test/blockstructured/poisson/3d/poisson.mini
@@ -10,9 +10,11 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
 blockstructured = 1
 number_of_blocks = 3
 
diff --git a/test/blockstructured/poisson/3d/poisson.ufl b/test/blockstructured/poisson/3d/poisson.ufl
index 562606820cf377a80cad2a0826f5207d4f5a2a5d..d0ebc7125e26ead1170cee0891dd628706a7ab11 100644
--- a/test/blockstructured/poisson/3d/poisson.ufl
+++ b/test/blockstructured/poisson/3d/poisson.ufl
@@ -10,7 +10,7 @@ u = TrialFunction(V)
 v = TestFunction(V)
 
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
-dirichlet_expression = g
+r = (inner(grad(u), grad(v)) - f*v)*dx
+interpolate_expression = g
 exact_solution = g
 is_dirichlet = 1
diff --git a/test/blockstructured/poisson/poisson.mini b/test/blockstructured/poisson/poisson.mini
index 98d4bd2636150eea063bc0259349f069d88473f9..3e7cdca908f73171b35e7b8cd120ee57e08ca485 100644
--- a/test/blockstructured/poisson/poisson.mini
+++ b/test/blockstructured/poisson/poisson.mini
@@ -10,7 +10,9 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
 blockstructured = 1
 number_of_blocks = 3
diff --git a/test/blockstructured/poisson/poisson.ufl b/test/blockstructured/poisson/poisson.ufl
index d8a7ef0ce23d9cc7cf96445509530af750199743..f4f7145ca85de1f39952ee090a51b9b207aa29be 100644
--- a/test/blockstructured/poisson/poisson.ufl
+++ b/test/blockstructured/poisson/poisson.ufl
@@ -9,7 +9,7 @@ V = FiniteElement("CG", cell, 2)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
-dirichlet_expression = g
+r = (inner(grad(u), grad(v)) - f*v)*dx
+interpolate_expression = g
 exact_solution = g
 is_dirichlet = 1
diff --git a/test/blockstructured/poisson/poisson_matrix_free.mini b/test/blockstructured/poisson/poisson_matrix_free.mini
index a5f2fecc732804cec7b9e64f2186033b8aeeba1e..e6b9af12a01c46fe8d6361050a341b7769a3e484 100644
--- a/test/blockstructured/poisson/poisson_matrix_free.mini
+++ b/test/blockstructured/poisson/poisson_matrix_free.mini
@@ -9,7 +9,9 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-matrix_free = 1
 compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+matrix_free = 1
 blockstructured = 1
 number_of_blocks = 4
\ No newline at end of file
diff --git a/test/blockstructured/poisson/poisson_neumann.mini b/test/blockstructured/poisson/poisson_neumann.mini
index 93272e18a46bb9b6e4c3e1ee19894846fd330784..1512f88bc1ba25141e121de948da3aed06933b97 100644
--- a/test/blockstructured/poisson/poisson_neumann.mini
+++ b/test/blockstructured/poisson/poisson_neumann.mini
@@ -10,7 +10,9 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 1e-8
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
 blockstructured = 1
 number_of_blocks = 4
diff --git a/test/blockstructured/poisson/poisson_neumann.ufl b/test/blockstructured/poisson/poisson_neumann.ufl
index 867403d8920c6869eee6721a943c3cd6199c57b4..9fd7f4c625284a9fe3276f80881dbdc873b31fad 100644
--- a/test/blockstructured/poisson/poisson_neumann.ufl
+++ b/test/blockstructured/poisson/poisson_neumann.ufl
@@ -16,7 +16,7 @@ v = TestFunction(V)
 # Define the boundary measure that knows where we are...
 ds = ds(subdomain_data=bctype)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx - j*v*ds(0)]
-dirichlet_expression = g
+r = (inner(grad(u), grad(v)) - f*v)*dx - j*v*ds(0)
+interpolate_expression = g
 exact_solution = g
 is_dirichlet = bctype
diff --git a/test/blockstructured/poisson/poisson_tensor.mini b/test/blockstructured/poisson/poisson_tensor.mini
index 44ad633559f6d7809db7b42298a3e5613161017a..ff6426b9129d3d0028047f7d1413a4f17761d6df 100644
--- a/test/blockstructured/poisson/poisson_tensor.mini
+++ b/test/blockstructured/poisson/poisson_tensor.mini
@@ -10,7 +10,9 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
 blockstructured = 1
 number_of_blocks = 4
diff --git a/test/blockstructured/poisson/poisson_tensor.ufl b/test/blockstructured/poisson/poisson_tensor.ufl
index df8bcbab312335e8e0c8e1160a145288187318fc..239e14fca9550031cd53994d9528cb0c41e424a6 100644
--- a/test/blockstructured/poisson/poisson_tensor.ufl
+++ b/test/blockstructured/poisson/poisson_tensor.ufl
@@ -12,7 +12,7 @@ V = FiniteElement("CG", cell, 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(A*grad(u), grad(v)) + c*u*v -f*v)*dx]
-dirichlet_expression = g
+r = (inner(A*grad(u), grad(v)) + c*u*v -f*v)*dx
+interpolate_expression = g
 exact_solution = g
 is_dirichlet = 1
diff --git a/test/blockstructured/stokes/stokes.mini b/test/blockstructured/stokes/stokes.mini
index ae59e3ef3ca1c12f74065e9c6c2ec361b495feb2..532a4159b6b6019525acdea50da3c18f14e22f9a 100644
--- a/test/blockstructured/stokes/stokes.mini
+++ b/test/blockstructured/stokes/stokes.mini
@@ -1,7 +1,7 @@
 __name = blockstructured_stokes_{__exec_suffix}
 __exec_suffix = symdiff, numdiff | expand num
 
-cells = 5 5
+cells = 20 20
 extension = 1. 1.
 
 [wrapper.vtkcompare]
@@ -10,7 +10,9 @@ reference = hagenpoiseuille_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 0, 1 | expand num
 compare_l2errorsquared = 1e-9
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
 blockstructured = 1
-number_of_blocks = 3
\ No newline at end of file
+number_of_blocks = 3
diff --git a/test/blockstructured/stokes/stokes.ufl b/test/blockstructured/stokes/stokes.ufl
index c8f630b84aae43a68221ace670fe9fcb027af47d..4411e791138cf85824f24fc6549d52cf6ef1af0d 100644
--- a/test/blockstructured/stokes/stokes.ufl
+++ b/test/blockstructured/stokes/stokes.ufl
@@ -13,7 +13,6 @@ u, p = TrialFunctions(TH)
 
 r = (inner(grad(v), grad(u)) - div(v)*p - q*div(u))*dx
 
-forms = [r]
 is_dirichlet = v_bctype, v_bctype, 0
-dirichlet_expression = g_v, None
+interpolate_expression = g_v, None
 exact_solution = g_v, 8.*(1.-x[0])
diff --git a/test/coeffeval/CMakeLists.txt b/test/coeffeval/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7ca549c996a2411ddd2746474fef1c7732f6566d
--- /dev/null
+++ b/test/coeffeval/CMakeLists.txt
@@ -0,0 +1,5 @@
+dune_add_formcompiler_system_test(UFLFILE poisson.ufl
+                                  SOURCE coeffeval_poisson.cc
+                                  INIFILE coeffeval_poisson.mini
+                                  BASENAME coeffeval_poisson
+                                  )
diff --git a/test/coeffeval/coeffeval_poisson.cc b/test/coeffeval/coeffeval_poisson.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4cd8e0d89850879394681162710f54e458a2bb3
--- /dev/null
+++ b/test/coeffeval/coeffeval_poisson.cc
@@ -0,0 +1,153 @@
+#include"config.h"
+
+#include "dune/common/parametertreeparser.hh"
+#include "dune/pdelab/gridfunctionspace/gridfunctionadapter.hh"
+#include "dune/pdelab/constraints/conforming.hh"
+#include "dune/pdelab/backend/istl.hh"
+#include "dune/pdelab/gridfunctionspace/vtk.hh"
+#include "dune/common/parametertree.hh"
+#include "dune/testtools/gridconstruction.hh"
+#include "dune/pdelab/finiteelementmap/pkfem.hh"
+#include <random>
+#include "dune/pdelab/function/callableadapter.hh"
+#include "dune/perftool/vtkpredicate.hh"
+#include <string>
+#include "dune/alugrid/grid.hh"
+#include "dune/pdelab/common/functionutilities.hh"
+#include "dune/pdelab/gridoperator/gridoperator.hh"
+#include "dune/pdelab/stationary/linearproblem.hh"
+#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh"
+#include "dune/pdelab/function/discretegridviewfunction.hh"
+
+#if OPERATOR == 1
+#include "poisson_grad_localoperator.hh"
+#endif
+
+#if OPERATOR == 0
+#include "poisson_nongrad_localoperator.hh"
+#endif
+
+int main(int argc, char** argv)
+{
+  // MPI helper stuff
+  Dune::MPIHelper& helper = Dune::MPIHelper::instance(argc, argv);
+
+  // Parse the ini file
+  Dune::ParameterTree initree;
+  Dune::ParameterTreeParser::readINITree(argv[1], initree);
+
+  // Build a grid
+  using Grid = Dune::ALUGrid<2, 2, Dune::simplex, Dune::conforming>;
+  using GV = Grid::LeafGridView;
+  IniGridFactory<Grid> factory(initree);
+  std::shared_ptr<Grid> grid = factory.getGrid();
+  GV gv = grid->leafGridView();
+
+  // General types and stuff
+  using DF = Grid::ctype;
+  using RangeType = double;
+
+  // Finite Element Maps
+  using P1_FEM = Dune::PDELab::PkLocalFiniteElementMap<GV, DF, RangeType, 1>;
+  using P2_FEM = Dune::PDELab::PkLocalFiniteElementMap<GV, DF, RangeType, 2>;
+  P1_FEM p1_fem(gv);
+  P2_FEM p2_fem(gv);
+
+  // Grid Function Spaces
+  using VectorBackend = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::none>;
+  using DirichletConstraintsAssember = Dune::PDELab::ConformingDirichletConstraints;
+  using P1_dirichlet_GFS = Dune::PDELab::GridFunctionSpace<GV, P1_FEM, DirichletConstraintsAssember, VectorBackend>;
+  using P2_GFS = Dune::PDELab::GridFunctionSpace<GV, P2_FEM, DirichletConstraintsAssember, VectorBackend>;
+  P1_dirichlet_GFS p1_dirichlet_gfs_(gv, p1_fem);
+  P2_GFS p2_gfs(gv, p2_fem);
+  p1_dirichlet_gfs_.name("p1_dirichlet_gfs_");
+  p1_dirichlet_gfs_.update();
+  std::cout << "gfs with " << p1_dirichlet_gfs_.size() << " dofs generated  "<< std::endl;
+
+  // Solution vectors / Grid Functions
+  using V_R = Dune::PDELab::Backend::Vector<P1_dirichlet_GFS,DF>;
+  using V2 = Dune::PDELab::Backend::Vector<P2_GFS,DF>;
+  V_R x_r(p1_dirichlet_gfs_);
+  V2 c(p2_gfs);
+
+  using GF = Dune::PDELab::DiscreteGridViewFunction<P2_GFS, V2>;
+  GF c_gf(p2_gfs, c);
+
+  // Local Operator
+  using LOP_R = PoissonLocalOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF>;
+  LOP_R lop_r(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, c_gf);
+
+  // Constraints stuff
+  using P1_dirichlet_GFS_CC = P1_dirichlet_GFS::ConstraintsContainer<RangeType>::Type;
+  P1_dirichlet_GFS_CC p1_dirichlet_gfs__cc;
+  p1_dirichlet_gfs__cc.clear();
+  auto p1_bctype_lambda = [&](const auto& x){ return 1.0; };
+  auto p1_bctype = Dune::PDELab::makeBoundaryConditionFromCallable(gv, p1_bctype_lambda);
+  Dune::PDELab::constraints(p1_bctype, p1_dirichlet_gfs_, p1_dirichlet_gfs__cc);
+  std::cout << "cc with " << p1_dirichlet_gfs__cc.size() << " dofs generated  "<< std::endl;
+
+  // Matrix Backend
+  using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+  int generic_dof_estimate =  6 * p1_dirichlet_gfs_.maxLocalSize();
+  int dofestimate = initree.get<int>("istl.number_of_nnz", generic_dof_estimate);
+  MatrixBackend mb(dofestimate);
+
+  // Grid Operator
+  using GO_r = Dune::PDELab::GridOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, P1_dirichlet_GFS_CC, P1_dirichlet_GFS_CC>;
+  GO_r go_r(p1_dirichlet_gfs_, p1_dirichlet_gfs__cc, p1_dirichlet_gfs_, p1_dirichlet_gfs__cc, lop_r, mb);
+
+  // Solver
+  using LinearSolver = Dune::PDELab::ISTLBackend_SEQ_SuperLU;
+  LinearSolver ls(false);
+  using SLP = Dune::PDELab::StationaryLinearProblemSolver<GO_r, LinearSolver, V_R>;
+
+  // Interpolation
+  auto lambda_0000 = [&](const auto& x){ return (double)exp((-1.0) * ((0.5 - x[1]) * (0.5 - x[1]) + (0.5 - x[0]) * (0.5 - x[0]))); };
+  auto func_0000 = Dune::PDELab::makeGridFunctionFromCallable(gv, lambda_0000);
+  Dune::PDELab::interpolate(func_0000, p1_dirichlet_gfs_, x_r);
+
+  auto lambda_0001 = [&](const auto& x){ return (0.5-x[0])*(0.5-x[0]) + (0.5-x[1])*(0.5-x[1]); };
+  auto func_0001 = Dune::PDELab::makeGridFunctionFromCallable(gv, lambda_0001);
+  Dune::PDELab::interpolate(func_0001, p2_gfs, c);
+
+  // Solving
+  double reduction = initree.get<double>("reduction", 1e-12);
+  SLP slp(go_r, ls, x_r, reduction);
+  slp.apply();
+
+  // VTK visualization
+  using VTKWriter = Dune::SubsamplingVTKWriter<GV>;
+  Dune::RefinementIntervals subint(initree.get<int>("vtk.subsamplinglevel", 1));
+  VTKWriter vtkwriter(gv, subint);
+  std::string vtkfile = initree.get<std::string>("wrapper.vtkcompare.name", "output");
+  CuttingPredicate predicate;
+  Dune::PDELab::addSolutionToVTKWriter(vtkwriter, p1_dirichlet_gfs_, x_r, Dune::PDELab::vtk::defaultNameScheme(), predicate);
+  vtkwriter.write(vtkfile, Dune::VTK::ascii);
+
+  // Error calculation
+  using P1_DIRICHLET_GFS__DGF = Dune::PDELab::DiscreteGridFunction<decltype(p1_dirichlet_gfs_),decltype(x_r)>;
+  P1_DIRICHLET_GFS__DGF p1_dirichlet_gfs__dgf(p1_dirichlet_gfs_,x_r);
+  using DifferenceSquaredAdapter_ = Dune::PDELab::DifferenceSquaredAdapter<decltype(func_0000), decltype(p1_dirichlet_gfs__dgf)>;
+  DifferenceSquaredAdapter_ dsa_(func_0000, p1_dirichlet_gfs__dgf);
+  RangeType l2error(0.0);
+  {
+    // L2 error squared of difference between numerical
+    // solution and the interpolation of exact solution
+    // for treepath ()
+    typename P1_DIRICHLET_GFS__DGF::Traits::RangeType err(0.0);
+    Dune::PDELab::integrateGridFunction(dsa_, err, 10);
+
+    l2error += err;
+    if (gv.comm().rank() == 0){
+      std::cout << "L2 Error for treepath : " << err << std::endl;
+    }}
+  bool testfail(false);
+  using std::abs;
+  using std::isnan;
+  if (gv.comm().rank() == 0){
+    std::cout << "\nl2errorsquared: " << l2error << std::endl << std::endl;
+  }
+  if (isnan(l2error) or abs(l2error)>1e-7)
+    testfail = true;
+  return testfail;
+}
diff --git a/test/coeffeval/coeffeval_poisson.mini b/test/coeffeval/coeffeval_poisson.mini
new file mode 100644
index 0000000000000000000000000000000000000000..e2ec630c49cc25886022aac1a652a81d761a0513
--- /dev/null
+++ b/test/coeffeval/coeffeval_poisson.mini
@@ -0,0 +1,22 @@
+__name = coeffeval_poisson_{__exec_suffix}
+__exec_suffix = {grad_suffix}
+
+grad_suffix = grad, nongrad | expand grad
+
+lowerleft = 0.0 0.0
+upperright = 1.0 1.0
+elements = 32 32
+elementType = simplical
+
+[formcompiler]
+compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+classname = PoissonLocalOperator
+filename = poisson_{grad_suffix}_localoperator.hh
+
+[formcompiler.ufl_variants]
+use_grad = 1, 0 | expand grad
+
+[__static]
+OPERATOR = 1, 0 | expand grad
diff --git a/test/coeffeval/poisson.ufl b/test/coeffeval/poisson.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..378d9267fadebc0e6d5f63ba6cb7fef7ad5a2b40
--- /dev/null
+++ b/test/coeffeval/poisson.ufl
@@ -0,0 +1,21 @@
+cell = triangle
+
+x = SpatialCoordinate(cell)
+
+V = FiniteElement("CG", cell, 1)
+u = TrialFunction(V)
+v = TestFunction(V)
+
+P2 = FiniteElement("CG", cell, 2)
+c = Coefficient(P2)
+
+if use_grad:
+    # This is a simple trick to test gradients of coefficients in an easy setting.
+    # The driver interpolates c = x0^2 + x1^2, for which the equation below holds
+    # identically (grad(c) = 2*x, so 0.25*|grad(c)|^2 = c). We thereby test the
+    # evaluation of c in terms of the evaluation of its gradient.
+    c = 0.25 * (grad(c)[0] * grad(c)[0] + grad(c)[1] * grad(c)[1])
+
+f = 4*(1.-c)*exp(-1.*c)
+
+r = (inner(grad(u), grad(v)) - f*v)*dx
diff --git a/test/heatequation/heatequation.mini b/test/heatequation/heatequation.mini
index 6059854ec5b6c09a97d06712b928d1e7044ef3af..24a699673927de379260c45e50ea6b707dde95be 100644
--- a/test/heatequation/heatequation.mini
+++ b/test/heatequation/heatequation.mini
@@ -14,6 +14,7 @@ extension = vtu
 [formcompiler]
 explicit_time_stepping = 0, 1 | expand scheme
 compare_l2errorsquared = 1e-7
+operators = mass, poisson
 
 # Disable explicit tests for now
 {__exec_suffix} == explicit | exclude
diff --git a/test/heatequation/heatequation.ufl b/test/heatequation/heatequation.ufl
index 9fe2e20bff8bead3b823bf86c9cb4b4372f155c9..8a4ef977b3bd956c86df1f2aff8acb43d7ebd45b 100644
--- a/test/heatequation/heatequation.ufl
+++ b/test/heatequation/heatequation.ufl
@@ -13,7 +13,6 @@ v = TestFunction(V)
 mass = (u*v)*dx
 poisson = (inner(grad(u), grad(v)) - f*v)*dx
 
-forms = [mass, poisson]
-dirichlet_expression = g
+interpolate_expression = g
 is_dirichlet = 1
 exact_solution = g
\ No newline at end of file
diff --git a/test/heatequation/heatequation_dg.mini b/test/heatequation/heatequation_dg.mini
index 3c21abe6c6a6f4ef80563db46ff49d83fdfdc17f..169be57b4a3d485505cc47ff132aa3cabecee212 100644
--- a/test/heatequation/heatequation_dg.mini
+++ b/test/heatequation/heatequation_dg.mini
@@ -14,6 +14,7 @@ extension = vtu
 [formcompiler]
 explicit_time_stepping = 0, 1 | expand scheme
 compare_l2errorsquared = 1e-7
+operators = mass, poisson
 
 # Disable explicit tests for now
 {__exec_suffix} == explicit | exclude
diff --git a/test/heatequation/heatequation_dg.ufl b/test/heatequation/heatequation_dg.ufl
index 8dfd74302e57c53f3672d4a0e25592dd0c4cd34c..1d113a88f35b8404d7129a2fe5a2494f060e394b 100644
--- a/test/heatequation/heatequation_dg.ufl
+++ b/test/heatequation/heatequation_dg.ufl
@@ -1,12 +1,13 @@
 cell = triangle
+degree = 1
+dim = 2
 
 x = SpatialCoordinate(cell)
-
 c = (0.5-x[0])**2 + (0.5-x[1])**2
 g = exp(-1.*c)
 f = 4*(1.-c)*g
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -14,25 +15,28 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 poisson = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
 mass = (u*v)*dx
 
-forms = [mass, poisson]
-dirichlet_expression = g
+interpolate_expression = g
 is_dirichlet = 1
 exact_solution = g
\ No newline at end of file
diff --git a/test/heatequation/heatequation_time_dependent_bc.mini b/test/heatequation/heatequation_time_dependent_bc.mini
index 191cc52f1c9ff806496affb5eb6a7d8ebbdb25d9..762951ac66010cdc35f7c37c59873d22a72c7eaa 100644
--- a/test/heatequation/heatequation_time_dependent_bc.mini
+++ b/test/heatequation/heatequation_time_dependent_bc.mini
@@ -14,6 +14,7 @@ extension = vtu
 [formcompiler]
 explicit_time_stepping = 0, 1 | expand scheme
 compare_l2errorsquared = 2e-4
+operators = mass, poisson
 
 [instat]
 T = 1.0
diff --git a/test/heatequation/heatequation_time_dependent_bc.ufl b/test/heatequation/heatequation_time_dependent_bc.ufl
index ac0079253cc170c757da9ae772eec8e4fd1a6087..6b8443c5712df1463c84576ba23702cb42c5c45b 100644
--- a/test/heatequation/heatequation_time_dependent_bc.ufl
+++ b/test/heatequation/heatequation_time_dependent_bc.ufl
@@ -16,7 +16,6 @@ v = TestFunction(V)
 mass = (u*v)*dx
 poisson = (inner(grad(u), grad(v)) - f*v)*dx
 
-forms = [mass, poisson]
-dirichlet_expression = g
+interpolate_expression = g
 is_dirichlet = 1
 exact_solution = g
diff --git a/test/hyperbolic/linearacoustics.mini b/test/hyperbolic/linearacoustics.mini
index 5ccd60388b75b190883de796ce1bd9f4ea2394e4..ad1cc95b405ebd4a7624ef8057b99834fb52ccbc 100644
--- a/test/hyperbolic/linearacoustics.mini
+++ b/test/hyperbolic/linearacoustics.mini
@@ -13,5 +13,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+explicit_time_stepping = 1
+operators = mass, r
+
+[formcompiler.mass]
+numerical_jacobian = 1
+
+[formcompiler.r]
 numerical_jacobian = 1
-explicit_time_stepping = 1
\ No newline at end of file
diff --git a/test/hyperbolic/linearacoustics.ufl b/test/hyperbolic/linearacoustics.ufl
index 8b9d48c4433f72395c054c88ea6c4eaeedb9fcb0..5a78e7848578053dbb8d7f75a94c2f901f5831d3 100644
--- a/test/hyperbolic/linearacoustics.ufl
+++ b/test/hyperbolic/linearacoustics.ufl
@@ -21,12 +21,11 @@ flux = as_matrix([[q0,  q1],
                   [0., rho]])
 
 # Define numerical fluxes to choose from
-llf_flux = dot(avg(flux), n) - 0.5*jump(u)
+llf_flux = dot(avg(flux), n) + 0.5*jump(u)
 numerical_flux = llf_flux
 
 r = -1. * inner(flux, grad(v))*dx \
-  - inner(numerical_flux, jump(v))*dS \
+  + inner(numerical_flux, jump(v))*dS \
   + inner(u, v)*ds
 
-forms = [mass, r]
-dirichlet_expression = f, 0.0, 0.0
+interpolate_expression = f, 0.0, 0.0
diff --git a/test/hyperbolic/lineartransport.mini b/test/hyperbolic/lineartransport.mini
index 1ca4dedeb11ae71cc2d9f3f99e55306ebd9580ed..60a465d670b1acc664c514afe6371a3c7038b51b 100644
--- a/test/hyperbolic/lineartransport.mini
+++ b/test/hyperbolic/lineartransport.mini
@@ -18,6 +18,12 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand diff
 explicit_time_stepping = 0, 1 | expand scheme
-compare_l2errorsquared = 1e-10
\ No newline at end of file
+compare_l2errorsquared = 1e-10
+operators = mass, r
+
+[formcompiler.mass]
+numerical_jacobian = 1, 0 | expand diff
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand diff
diff --git a/test/hyperbolic/lineartransport.ufl b/test/hyperbolic/lineartransport.ufl
index 858a18bd32537eec046de5b382a95d95401e9aae..eaf8d33efe67a07a9a688efd3949e709a0a8122b 100644
--- a/test/hyperbolic/lineartransport.ufl
+++ b/test/hyperbolic/lineartransport.ufl
@@ -15,17 +15,16 @@ v = TestFunction(V)
 beta = as_vector((1., 1.))
 n = FacetNormal(cell)('+')
 
-def numerical_flux(normal, outside, inside):
-	return conditional(inner(beta, n) > 0, inside, outside)*inner(beta, n)
+def numerical_flux(normal, inside, outside):
+	return conditional(inner(beta, normal) > 0, inside, outside)*inner(beta, normal)
 
 mass = u*v*dx
 
 r = -1.*u*inner(beta, grad(v))*dx \
-  - numerical_flux(n, u('+'), u('-'))*jump(v)*dS \
+  + numerical_flux(n, u('+'), u('-'))*jump(v)*dS \
   + inner(beta, n)*u*v*dso \
-  + numerical_flux(n, 0.0, u('-'))*v*dsd
+  + numerical_flux(n, u('+'), 0.0)*v*dsd
 
-forms = [mass, r]
 is_dirichlet = dirichlet
-dirichlet_expression = initial
+interpolate_expression = initial
 exact_solution = 0
\ No newline at end of file
diff --git a/test/hyperbolic/shallowwater.mini b/test/hyperbolic/shallowwater.mini
index f72b422b2fb19fa39969fc0febae5cb39d647c88..39408b491829daadb409652db25a4c41e8606c31 100644
--- a/test/hyperbolic/shallowwater.mini
+++ b/test/hyperbolic/shallowwater.mini
@@ -14,5 +14,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1
+operators = mass, r
 explicit_time_stepping = 1
+
+[formcompiler.mass]
+numerical_jacobian = 1
+
+[formcompiler.r]
+numerical_jacobian = 1
diff --git a/test/hyperbolic/shallowwater.ufl b/test/hyperbolic/shallowwater.ufl
index 8737b2851cce2fb093a0f6000c8cc649356d3ff3..c58429633b4ec11f5a69fc87fdbf0ed82c034a1b 100644
--- a/test/hyperbolic/shallowwater.ufl
+++ b/test/hyperbolic/shallowwater.ufl
@@ -23,14 +23,13 @@ b_flux = as_matrix([[-1.* q], [q*q/h + 0.5*g*h*h]])
 
 # Define numerical fluxes to choose from
 alpha = Max(abs(n[0]*q('+')) / h('+') + sqrt(g*h('+')), abs(n[0]*q('-')) / h('-') + sqrt(g*h('-')))
-llf_flux = dot(avg(flux), n) - 0.5*alpha*jump(u)
+llf_flux = dot(avg(flux), n) + 0.5*alpha*jump(u)
 alpha_b = abs(n[0]*q) / h + sqrt(g*h)
 boundary_flux = 0.5*dot(flux + b_flux, n) + alpha_b * as_vector([0., q])
 numerical_flux = llf_flux
 
 r = -1. * inner(flux, grad(v))*dx \
-  - inner(numerical_flux, jump(v))*dS \
+  + inner(numerical_flux, jump(v))*dS \
   + inner(boundary_flux, v)*ds
 
-forms = [mass, r]
-dirichlet_expression = f, 0.0
+interpolate_expression = f, 0.0
diff --git a/test/laplace/CMakeLists.txt b/test/laplace/CMakeLists.txt
deleted file mode 100644
index 5ae3a8576c632975f3f33be32aa17be96b6d4597..0000000000000000000000000000000000000000
--- a/test/laplace/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-dune_add_formcompiler_system_test(UFLFILE laplace.ufl
-                                  BASENAME laplace
-                                  INIFILE laplace.mini)
-
-dune_add_formcompiler_system_test(UFLFILE laplace_dg.ufl
-                                  BASENAME laplace_dg
-                                  INIFILE laplace_dg.mini)
-
-add_executable(laplace_dg_ref reference_main.cc)
-set_target_properties(laplace_dg_ref PROPERTIES EXCLUDE_FROM_ALL 1)
diff --git a/test/laplace/laplace.mini b/test/laplace/laplace.mini
deleted file mode 100644
index 1db0ffd82475b75b830f085586bb2d991af2c514..0000000000000000000000000000000000000000
--- a/test/laplace/laplace.mini
+++ /dev/null
@@ -1,11 +0,0 @@
-__name = laplace_{__exec_suffix}
-__exec_suffix = symdiff, numdiff | expand num
-
-lowerleft = 0.0 0.0
-upperright = 1.0 1.0
-elements = 4 4
-elementType = simplical
-printmatrix = true
-
-[formcompiler]
-numerical_jacobian = 0, 1 | expand num
diff --git a/test/laplace/laplace.ufl b/test/laplace/laplace.ufl
deleted file mode 100644
index 29b6a4bd3d6da839ca622098a222079d716a4f9e..0000000000000000000000000000000000000000
--- a/test/laplace/laplace.ufl
+++ /dev/null
@@ -1,5 +0,0 @@
-V = FiniteElement("CG", "triangle", 1)
-u = TrialFunction(V)
-v = TestFunction(V)
-
-forms = [inner(grad(u), grad(v))*dx]
diff --git a/test/laplace/laplace_dg.mini b/test/laplace/laplace_dg.mini
deleted file mode 100644
index 04a3c3dd0b0c3a5ee4f7d01c073b00f03d498b72..0000000000000000000000000000000000000000
--- a/test/laplace/laplace_dg.mini
+++ /dev/null
@@ -1,11 +0,0 @@
-__name = laplace_dg_{__exec_suffix}
-__exec_suffix = numdiff, symdiff | expand num
-
-lowerleft = 0.0 0.0
-upperright = 1.0 1.0
-elements = 2 2
-elementType = simplical
-printmatrix = true
-
-[formcompiler]
-numerical_jacobian = 1, 0 | expand num
diff --git a/test/laplace/laplace_dg.ufl b/test/laplace/laplace_dg.ufl
deleted file mode 100644
index 1a30bcea2f224b8e7a81ec1bf1a60eb39f58ae16..0000000000000000000000000000000000000000
--- a/test/laplace/laplace_dg.ufl
+++ /dev/null
@@ -1,23 +0,0 @@
-cell = triangle
-V = FiniteElement("DG", cell, 1)
-
-u = TrialFunction(V)
-v = TestFunction(V)
-
-n = FacetNormal(cell)('+')
-
-# penalty factor
-gamma = 1.0
-
-# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
-theta = 1.0
-
-r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
-  - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
-  + theta*u*inner(grad(v), n)*ds
-
-forms = [r]
diff --git a/test/laplace/reference_driver.hh b/test/laplace/reference_driver.hh
deleted file mode 100644
index 02791b7dcb785463376e4dca0c15407f496d0c74..0000000000000000000000000000000000000000
--- a/test/laplace/reference_driver.hh
+++ /dev/null
@@ -1,144 +0,0 @@
-#ifndef _HOME_DOMINIC_DUNE_BUILD_DUNE_PERFTOOL_TEST_LAPLACE_LAPLACE_DG_SYMDIFF_DRIVER_HH
-#define _HOME_DOMINIC_DUNE_BUILD_DUNE_PERFTOOL_TEST_LAPLACE_LAPLACE_DG_SYMDIFF_DRIVER_HH
-
-#include <dune/pdelab/gridoperator/gridoperator.hh>
-#include <dune/pdelab/backend/istl.hh>
-#include <dune/pdelab/backend/istl.hh>
-#include <dune/pdelab/gridfunctionspace/vtk.hh>
-#include <dune/grid/uggrid.hh>
-#include <dune/pdelab/backend/istl.hh>
-#include <string>
-#include <dune/common/parametertree.hh>
-#include <dune/pdelab/finiteelementmap/opbfem.hh>
-#include <dune/grid/io/file/vtk/subsamplingvtkwriter.hh>
-#include <dune/pdelab/stationary/linearproblem.hh>
-#include <dune/common/parametertreeparser.hh>
-#include <dune/testtools/gridconstruction.hh>
-#include <dune/pdelab/localoperator/convectiondiffusiondg.hh>
-#include <dune/pdelab/localoperator/convectiondiffusionparameter.hh>
-
-template<typename GV, typename RF>
-class CDProb
-{
-  typedef Dune::PDELab::ConvectionDiffusionBoundaryConditions::Type BCType;
-
-  public:
-  typedef Dune::PDELab::ConvectionDiffusionParameterTraits<GV,RF> Traits;
-
-  //! tensor diffusion coefficient
-  typename Traits::PermTensorType
-  A (const typename Traits::ElementType& e, const typename Traits::DomainType& x) const
-  {
-    typename Traits::PermTensorType I;
-    for (std::size_t i=0; i<Traits::dimDomain; i++)
-      for (std::size_t j=0; j<Traits::dimDomain; j++)
-        I[i][j] = (i==j) ? 1 : 0;
-    return I;
-  }
-
-  //! velocity field
-  typename Traits::RangeType
-  b (const typename Traits::ElementType& e, const typename Traits::DomainType& x) const
-  {
-    typename Traits::RangeType v(0.0);
-    return v;
-  }
-
-  //! sink term
-  typename Traits::RangeFieldType
-  c (const typename Traits::ElementType& e, const typename Traits::DomainType& x) const
-  {
-    return 0.0;
-  }
-
-  //! source term
-  typename Traits::RangeFieldType
-  f (const typename Traits::ElementType& e, const typename Traits::DomainType& x) const
-  {
-    return 0.0;
-  }
-
-  //! boundary condition type function
-  BCType
-  bctype (const typename Traits::IntersectionType& is, const typename Traits::IntersectionDomainType& x) const
-  {
-    return Dune::PDELab::ConvectionDiffusionBoundaryConditions::Dirichlet;
-  }
-
-  //! Dirichlet boundary condition value
-  typename Traits::RangeFieldType
-  g (const typename Traits::ElementType& e, const typename Traits::DomainType& x) const
-  {
-    return 0.0;
-  }
-
-  //! Neumann boundary condition
-  typename Traits::RangeFieldType
-  j (const typename Traits::IntersectionType& is, const typename Traits::IntersectionDomainType& x) const
-  {
-    return 0.0;
-  }
-
-  //! outflow boundary condition
-  typename Traits::RangeFieldType
-  o (const typename Traits::IntersectionType& is, const typename Traits::IntersectionDomainType& x) const
-  {
-    return 0.0;
-  }
-};
-
-
-void driver(int argc, char** argv){  typedef Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::none, 1> VectorBackend;
-  static const int dim = 2;
-  typedef Dune::UGGrid<dim> Grid;
-  typedef Grid::LeafGridView GV;
-  typedef Grid::ctype DF;
-  typedef double R;
-  typedef Dune::PDELab::OPBLocalFiniteElementMap<DF, R, 1, dim, Dune::GeometryType::simplex> DG1_FEM;
-  typedef Dune::PDELab::NoConstraints NoConstraintsAssembler;
-  typedef Dune::PDELab::GridFunctionSpace<GV, DG1_FEM, NoConstraintsAssembler, VectorBackend> DG1_DIRICHLET_GFS;
-  Dune::ParameterTree initree;
-  Dune::ParameterTreeParser::readINITree(argv[1], initree);
-  IniGridFactory<Grid> factory(initree);
-  std::shared_ptr<Grid> grid = factory.getGrid();
-  GV gv = grid->leafGridView();
-  DG1_FEM dg1_fem;
-  DG1_DIRICHLET_GFS dg1_dirichlet_gfs(gv, dg1_fem);
-  dg1_dirichlet_gfs.name("bla");
-  typedef Dune::SubsamplingVTKWriter<GV> VTKWriter;
-  int sublevel = initree.get<int>("vtk.subsamplinglevel", 0);
-  VTKWriter vtkwriter(gv, sublevel);
-  using LOP = Dune::PDELab::ConvectionDiffusionDG<CDProb<GV, R>, DG1_FEM>;
-  typedef DG1_DIRICHLET_GFS::ConstraintsContainer<R>::Type DG1_CC;
-  typedef Dune::PDELab::ISTL::BCRSMatrixBackend<> MatrixBackend;
-  typedef Dune::PDELab::GridOperator<DG1_DIRICHLET_GFS, DG1_DIRICHLET_GFS, LOP, MatrixBackend, DF, R, R, DG1_CC, DG1_CC> GO;
-  typedef GO::Traits::Domain V;
-  V x(dg1_dirichlet_gfs);
-  x = 0.0;
-  std::string vtkfile = initree.get<std::string>("wrapper.vtkcompare.name", "output");
-  typedef Dune::PDELab::ISTLBackend_SEQ_UMFPack LinearSolver;
-  typedef Dune::PDELab::StationaryLinearProblemSolver<GO, LinearSolver, V> SLP;
-  DG1_CC dg1_cc;
-  dg1_cc.clear();
-  CDProb<GV, R> params;
-  LOP lop(params, Dune::PDELab::ConvectionDiffusionDGMethod::SIPG);
-  int generic_dof_estimate =  6 * dg1_dirichlet_gfs.maxLocalSize();
-  int dofestimate = initree.get<int>("istl.number_of_nnz", generic_dof_estimate);
-  MatrixBackend mb(dofestimate);
-  GO go(dg1_dirichlet_gfs, dg1_cc, dg1_dirichlet_gfs, dg1_cc, lop, mb);
-  std::cout << "gfs with " << dg1_dirichlet_gfs.size() << " dofs generated  "<< std::endl;
-  std::cout << "cc with " << dg1_cc.size() << " dofs generated  "<< std::endl;
-  LinearSolver ls(false);
-  double reduction = initree.get<double>("reduction", 1e-12);
-  SLP slp(go, ls, x, reduction);
-  slp.apply();
-  typedef typename GO::Traits::Jacobian M;
-  M m(go);
-  go.jacobian(x,m);
-  using Dune::PDELab::Backend::native;
-  Dune::printmatrix(std::cout, native(m),"global stiffness matrix","row",9,1);
-  Dune::PDELab::addSolutionToVTKWriter(vtkwriter, dg1_dirichlet_gfs, x);
-  vtkwriter.write(vtkfile, Dune::VTK::ascii);
-}
-
-#endif //_HOME_DOMINIC_DUNE_BUILD_DUNE_PERFTOOL_TEST_LAPLACE_LAPLACE_DG_SYMDIFF_DRIVER_HH
diff --git a/test/laplace/reference_main.cc b/test/laplace/reference_main.cc
deleted file mode 100644
index ff00d5c060de6a23fafc79ffb7aeabb8f5d1ac9d..0000000000000000000000000000000000000000
--- a/test/laplace/reference_main.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <dune/common/parallel/mpihelper.hh>
-#include <dune/common/exceptions.hh>
-
-#include"/home/dominic/dune/dune-perftool/test/laplace/reference_driver.hh"
-
-int main(int argc, char** argv)
-{
-  try{
-    //Maybe initialize Mpi
-    Dune::MPIHelper& helper = Dune::MPIHelper::instance(argc, argv);
-    if(Dune::MPIHelper::isFake)
-      std::cout<< "This is a sequential program." << std::endl;
-    else
-      std::cout<<"I am rank "<<helper.rank()<<" of "<<helper.size()
-        <<" processes!"<<std::endl;
-
-    driver(argc, argv);
-
-    return 0;
-  }
-  catch (Dune::Exception &e){
-    std::cerr << "Dune reported error: " << e << std::endl;
-    return 1;
-  }
-  catch (...){
-    std::cerr << "Unknown exception thrown!" << std::endl;
-    return 1;
-  }
-}
diff --git a/test/navier-stokes/CMakeLists.txt b/test/navier-stokes/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b87827a05984dbdded01684d7b328d80d48895b1
--- /dev/null
+++ b/test/navier-stokes/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_subdirectory(reference_program)
+
+dune_add_formcompiler_system_test(UFLFILE navierstokes_2d_dg_quadrilateral.ufl
+                                  BASENAME navierstokes_2d_dg_quadrilateral
+                                  INIFILE navierstokes_2d_dg_quadrilateral.mini
+                                  SCRIPT dune_execute_parallel.py
+                                  )
+
+dune_add_formcompiler_system_test(UFLFILE navierstokes_3d_dg_quadrilateral.ufl
+                                  BASENAME navierstokes_3d_dg_quadrilateral
+                                  INIFILE navierstokes_3d_dg_quadrilateral.mini
+                                  )
diff --git a/test/navier-stokes/navierstokes_2d_dg_quadrilateral.mini b/test/navier-stokes/navierstokes_2d_dg_quadrilateral.mini
new file mode 100644
index 0000000000000000000000000000000000000000..5700107d03d62655e87b50b129aabc5b534134d2
--- /dev/null
+++ b/test/navier-stokes/navierstokes_2d_dg_quadrilateral.mini
@@ -0,0 +1,39 @@
+__name = navierstokes_2d_dg_quadrilateral_{__exec_suffix}
+__exec_suffix = symdiff, numdiff | expand num
+
+cells = 16 16
+lowerleft = -1. -1.
+extension = 2. 2.
+periodic = true true
+
+printmatrix = false
+
+[wrapper.execute_parallel]
+numprocessors = 4
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+operators = mass, r
+compare_l2errorsquared = 5e-5
+# Only calculate error for the velocity part
+l2error_tree_path = 1, 1, 0
+explicit_time_stepping = 0
+yaspgrid_offset = 1
+overlapping = 1
+
+[formcompiler.mass]
+numerical_jacobian = 0, 1 | expand num
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
+
+[instat]
+T = 1e-2
+dt = 1e-3
+nth = 1
+
+# Disable numdiff tests
+{__exec_suffix} == numdiff | exclude
diff --git a/test/navier-stokes/navierstokes_2d_dg_quadrilateral.ufl b/test/navier-stokes/navierstokes_2d_dg_quadrilateral.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..957d715e82acf50a936b9977957b3ae16c1d3e3b
--- /dev/null
+++ b/test/navier-stokes/navierstokes_2d_dg_quadrilateral.ufl
@@ -0,0 +1,47 @@
+# Taylor-Green vortex
+
+cell = quadrilateral
+degree = 2
+dim = 2
+
+x = SpatialCoordinate(cell)
+time = get_time(cell)
+
+P2 = VectorElement("DG", cell, degree)
+P1 = FiniteElement("DG", cell, degree-1)
+TH = P2 * P1
+
+v, q = TestFunctions(TH)
+u, p = TrialFunctions(TH)
+
+n = FacetNormal(cell)('+')
+
+rho = 1.0
+mu = 1.0/100.0
+
+g_v = as_vector((-exp(-2*pi*pi*mu/rho*time)*cos(pi*x[0])*sin(pi*x[1]),
+                 exp(-2*pi*pi*mu/rho*time)*sin(pi*x[0])*cos(pi*x[1])))
+g_p = -0.25*rho*exp(-4*pi*pi*mu/rho*time)*(cos(2*pi*x[0])+cos(2*pi*x[1]))
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = -1.0
+
+# penalty factor
+alpha = 1.0
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+mass = rho*inner(u,v)*dx
+
+r = mu * inner(grad(u), grad(v))*dx \
+  - p*div(v)*dx \
+  - q*div(u)*dx \
+  + rho * inner(grad(u)*u,v)*dx \
+  - mu * inner(avg(grad(u))*n, jump(v))*dS \
+  + mu * gamma_int * inner(jump(u), jump(v))*dS \
+  + mu * theta * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS
+
+interpolate_expression = g_v, g_p
+exact_solution = g_v, g_p
diff --git a/test/navier-stokes/navierstokes_3d_dg_quadrilateral.mini b/test/navier-stokes/navierstokes_3d_dg_quadrilateral.mini
new file mode 100644
index 0000000000000000000000000000000000000000..a329eaaff7b46954009490c63b672f8830c3a73e
--- /dev/null
+++ b/test/navier-stokes/navierstokes_3d_dg_quadrilateral.mini
@@ -0,0 +1,33 @@
+__name = navierstokes_3d_dg_quadrilateral_{__exec_suffix}
+__exec_suffix = symdiff, numdiff | expand num
+
+cells = 4 4 4
+lowerleft = -1. -1. -1.
+extension = 2. 2. 2.
+
+printmatrix = false
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+explicit_time_stepping = 0
+yaspgrid_offset = 1
+compare_l2errorsquared = 5e-4
+# Only calculate error for the velocity part
+l2error_tree_path = 1, 1, 1, 0
+operators = mass, r
+
+[formcompiler.mass]
+numerical_jacobian = 0, 1 | expand num
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
+
+[instat]
+T = 1e-1
+dt = 5e-2
+
+# Disable numdiff tests
+{__exec_suffix} == numdiff | exclude
diff --git a/test/navier-stokes/navierstokes_3d_dg_quadrilateral.ufl b/test/navier-stokes/navierstokes_3d_dg_quadrilateral.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..96038a8b4ad4cc0b2ec8254f5a0bfcbcd4922373
--- /dev/null
+++ b/test/navier-stokes/navierstokes_3d_dg_quadrilateral.ufl
@@ -0,0 +1,58 @@
+# Beltrami flow
+
+cell = hexahedron
+degree = 2
+dim = 3
+
+x = SpatialCoordinate(cell)
+time = get_time(cell)
+
+P2 = VectorElement("DG", cell, degree)
+P1 = FiniteElement("DG", cell, degree-1)
+TH = P2 * P1
+
+v, q = TestFunctions(TH)
+u, p = TrialFunctions(TH)
+
+n = FacetNormal(cell)('+')
+
+rho = 1.0
+mu = 1.0
+
+a = pi/4
+d = pi/2
+g_v = as_vector((-a*exp(-d*d*time)*(exp(a*x[0])*sin(d*x[2]+a*x[1])+cos(d*x[1]+a*x[0])*exp(a*x[2])),
+                 -a*exp(-d*d*time)*(exp(a*x[0])*cos(d*x[2]+a*x[1])+exp(a*x[1])*sin(a*x[2]+d*x[0])),
+                 -a*exp(-d*d*time)*(exp(a*x[1])*cos(a*x[2]+d*x[0])+sin(d*x[1]+a*x[0])*exp(a*x[2]))))
+g_p = -0.5*a*a*rho*exp(-2*d*d*time)*  (  2*cos(d*x[1]+a*x[0])*exp(a*(x[2]+x[0]) )*sin(d*x[2]+a*x[1])  +  2*exp(a*(x[1]+x[0]))*sin(a*x[2]+d*x[0])*cos(d*x[2]+a*x[1])  +  2*sin(d*x[1]+a*x[0])*exp(a*(x[2]+x[1]))*cos(a*x[2]+d*x[0])  +  exp(2*a*x[2])  +  exp(2*a*x[1])  +  exp(2*a*x[0])  )
+
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = -1.0
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+mass = rho*inner(u,v)*dx
+
+r = mu * inner(grad(u), grad(v))*dx \
+  - p*div(v)*dx \
+  - q*div(u)*dx \
+  + rho * inner(grad(u)*u,v)*dx \
+  - mu * inner(avg(grad(u))*n, jump(v))*dS \
+  + mu * gamma_int * inner(jump(u), jump(v))*dS \
+  + mu * theta * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS \
+  - mu * inner(grad(u)*n, v)*ds \
+  + mu * gamma_ext * inner(u-g_v, v)*ds \
+  + mu * theta * inner(grad(v)*n, u-g_v)*ds \
+  + p*inner(v, n)*ds \
+  + q*inner(u-g_v, n)*ds
+
+interpolate_expression = g_v, g_p
+exact_solution = g_v, g_p
diff --git a/test/navier-stokes/reference_program/CMakeLists.txt b/test/navier-stokes/reference_program/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..21af3e83f8d1fdec5377291b521f8712987a494d
--- /dev/null
+++ b/test/navier-stokes/reference_program/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_executable(taylor-green taylor-green.cc)
+dune_symlink_to_source_files(FILES taylor-green.ini)
+set_target_properties(taylor-green PROPERTIES EXCLUDE_FROM_ALL 1)
diff --git a/test/navier-stokes/reference_program/taylor-green.cc b/test/navier-stokes/reference_program/taylor-green.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e48e04296122dd89b615166f33d21219324d0726
--- /dev/null
+++ b/test/navier-stokes/reference_program/taylor-green.cc
@@ -0,0 +1,310 @@
+// -*- tab-width: 2; indent-tabs-mode: nil -*-
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <iostream>
+#include <vector>
+#include <map>
+#include <string>
+#include <random>
+#include <dune/common/parallel/mpihelper.hh>
+#include <dune/common/exceptions.hh>
+#include <dune/common/fvector.hh>
+#include <dune/grid/yaspgrid.hh>
+#include <dune/grid/io/file/vtk/subsamplingvtkwriter.hh>
+#include <dune/istl/bvector.hh>
+#include <dune/istl/operators.hh>
+#include <dune/istl/solvers.hh>
+#include <dune/istl/preconditioners.hh>
+#include <dune/istl/io.hh>
+
+#include <dune/pdelab/common/function.hh>
+#include <dune/pdelab/common/functionutilities.hh>
+#include <dune/pdelab/finiteelementmap/qkdg.hh>
+#include <dune/pdelab/gridfunctionspace/gridfunctionspaceutilities.hh>
+#include <dune/pdelab/gridfunctionspace/subspace.hh>
+#include <dune/pdelab/gridfunctionspace/vectorgridfunctionspace.hh>
+#include <dune/pdelab/gridfunctionspace/vtk.hh>
+#include <dune/pdelab/gridoperator/gridoperator.hh>
+#include <dune/pdelab/gridfunctionspace/interpolate.hh>
+#include <dune/pdelab/localoperator/dgnavierstokes.hh>
+#include <dune/pdelab/backend/istl.hh>
+#include <dune/pdelab/finiteelementmap/monomfem.hh>
+#include <dune/pdelab/common/function.hh>
+#include <dune/pdelab/common/vtkexport.hh>
+#include <dune/pdelab/constraints/p0.hh>
+#include<dune/pdelab/gridoperator/onestep.hh>
+#include<dune/pdelab/newton/newton.hh>
+#include "dune/perftool/vtkpredicate.hh"
+#include "dune/grid/io/file/vtk/vtksequencewriter.hh"
+
+#include "taylor-green.hh"
+
+
+#define PERIODIC
+// #define NORMALIZE_PRESSURE
+
+// Solves the instationary 2D Taylor-Green vortex with a DG discretization (velocity degree
+// vOrder, pressure degree pOrder) on gv. configuration provides the [parameters] subtree and the
+// [driver] keys time_end, dt and nth; VTK output is written as a sequence named filename.
+template<typename GV, typename RF, int vOrder, int pOrder>
+void taylor_green (const GV& gv, const Dune::ParameterTree& configuration, std::string filename)
+{
+  // Some types
+  using ES = Dune::PDELab::AllEntitySet<GV>;
+  ES es(gv);
+  using DF = typename ES::Grid::ctype;
+  static const unsigned int dim = ES::dimension;
+  static_assert(dim == 2, "taylor_green: the analytic solution and child naming below are 2D only");
+
+  // Create finite element maps with the polynomial degrees requested by the caller
+  const int velocity_degree = vOrder;
+  const int pressure_degree = pOrder;
+  using FEM_V = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RF, velocity_degree, dim>;
+  using FEM_P = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RF, pressure_degree, dim>;
+  FEM_V fem_v;
+  FEM_P fem_p;
+
+  // Do not block anything and order it lexicographic
+  using VectorBackend_V = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none>;
+  using VectorBackend_P = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none>;
+  using VectorBackend = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none>;
+  using OrderingTag_V = Dune::PDELab::LexicographicOrderingTag;
+
+  // For periodic boundary conditions in Yasp grid we need an
+  // overlap. Therefore we run our program in parallel and need these
+  // constraints
+#ifdef PERIODIC
+  using Con = Dune::PDELab::P0ParallelConstraints;
+#else
+  using Con = Dune::PDELab::NoConstraints;
+#endif
+
+  // Velocity GFS
+  using GFS_V = Dune::PDELab::VectorGridFunctionSpace<
+    ES,FEM_V,dim,
+    VectorBackend,
+    VectorBackend_V,
+    Con,
+    OrderingTag_V
+    >;
+  GFS_V gfs_v(es,fem_v);
+  gfs_v.name("v");
+
+  // Pressure GFS
+  using GFS_P = Dune::PDELab::GridFunctionSpace<
+    ES,
+    FEM_P,
+    Con,
+    VectorBackend_P>;
+  GFS_P gfs_p(es,fem_p);
+  gfs_p.name("p");
+
+
+  // Composite GFS holding velocity (child 0) and pressure (child 1)
+  using OrderingTag = Dune::PDELab::LexicographicOrderingTag;
+  using GFS = Dune::PDELab::CompositeGridFunctionSpace<VectorBackend,OrderingTag,GFS_V,GFS_P>;
+  GFS gfs(gfs_v, gfs_p);
+  using namespace Dune::Indices;
+  gfs_v.child(_0).name("velocity_0");
+  gfs_v.child(_1).name("velocity_1");
+  gfs_p.name("pressure");  // replaces the name "p" assigned above
+  gfs.name("test");
+  gfs.update();
+  using CC = typename GFS::template ConstraintsContainer<double>::Type;
+  CC cc;
+  cc.clear();
+#ifdef PERIODIC
+  Dune::PDELab::constraints(gfs,cc);
+#endif
+  std::cout << "gfs with " << gfs.size() << " dofs generated  "<< std::endl;
+  std::cout << "cc with " << cc.size() << " dofs generated  "<< std::endl;
+
+  // Parameter functions
+  using FType = ZeroVectorFunction<ES,RF,dim>;
+  FType f(es);
+  using BType = BCTypeParamGlobalDirichlet;
+  BType b;
+  using VType = TaylorGreenVelocity<ES,RF,dim>;
+  VType v(es, configuration.sub("parameters"));
+  using PType = TaylorGreenPressure<ES,RF>;
+  PType p(es, configuration.sub("parameters"));
+  using PenaltyTerm = Dune::PDELab::DefaultInteriorPenalty<RF>;
+
+  // Local operator
+  using LOP_Parameters =
+    Dune::PDELab::DGNavierStokesParameters<ES,RF,FType,BType,VType,PType,true,false,PenaltyTerm>;
+  LOP_Parameters lop_parameters(configuration.sub("parameters"),f,b,v,p);
+  using LOP = Dune::PDELab::DGNavierStokes<LOP_Parameters>;
+  const int superintegration_order = 0;
+  LOP lop(lop_parameters,superintegration_order);
+  using LOP_M = Dune::PDELab::NavierStokesMass<LOP_Parameters>;
+  LOP_M lop_m(lop_parameters,1);
+
+  // Grid operator
+  using MBE = Dune::PDELab::istl::BCRSMatrixBackend<>;
+  MBE mbe(75); // Maximal number of nonzeroes per row can be cross-checked by patternStatistics().
+  using GO_R = Dune::PDELab::GridOperator<GFS,GFS,LOP,MBE,RF,RF,RF,CC,CC>;
+  GO_R go_r(gfs,cc,gfs,cc,lop,mbe);
+  using GO_M = Dune::PDELab::GridOperator<GFS,GFS,LOP_M,MBE,RF,RF,RF,CC,CC>;
+  GO_M go_m(gfs,cc,gfs,cc,lop_m,mbe);
+  using IGO = Dune::PDELab::OneStepGridOperator<GO_R,GO_M>;
+  IGO igo(go_r,go_m);
+
+  // Create initial solution from the analytic functions (their time starts at 0)
+  using InitialVelocity = TaylorGreenVelocity<GV,RF,dim>;
+  InitialVelocity initial_velocity(gv, configuration.sub("parameters"));
+  using InitialPressure = TaylorGreenPressure<GV,RF>;
+  InitialPressure initial_pressure(gv, configuration.sub("parameters"));
+  using InitialSolution = Dune::PDELab::CompositeGridFunction<InitialVelocity,InitialPressure>;
+  InitialSolution initial_solution(initial_velocity, initial_pressure);
+
+  // Make coefficient vector and initialize it from a function
+  using V = typename IGO::Traits::Domain;
+  V xold(gfs);
+  xold = 0.0;
+  Dune::PDELab::interpolate(initial_solution,gfs,xold);
+
+  // Solver
+#ifdef PERIODIC
+  using LinearSolver = Dune::PDELab::ISTLBackend_OVLP_BCGS_ILU0<GFS,CC>;
+  LinearSolver ls(gfs,cc);
+#else
+  using LinearSolver = Dune::PDELab::ISTLBackend_SEQ_BCGS_ILU0;
+  LinearSolver ls;
+  // using LinearSolver = Dune::PDELab::ISTLBackend_SEQ_UMFPack;
+  // LinearSolver ls(false);
+#endif
+  using PDESolver = Dune::PDELab::Newton<IGO,LinearSolver,V>;
+  PDESolver newton(igo,xold,ls);
+  // newton.setReassembleThreshold(0.0);
+  // newton.setVerbosityLevel(2);
+  // newton.setMaxIterations(50);
+  // newton.setLineSearchMaxIterations(30);
+
+  // Time stepping
+  // using TSM = Dune::PDELab::OneStepThetaParameter<RF>;
+  // TSM tsm(1.0);
+  using TSM = Dune::PDELab::Alexander2Parameter<RF>;
+  TSM tsm;
+  Dune::PDELab::OneStepMethod<RF,IGO,PDESolver,V,V> osm(tsm,igo,newton);
+  // osm.setVerbosityLevel(2);
+
+  // Set time
+  RF time = 0.0;
+  RF time_end = configuration.get<RF>("driver.time_end");
+  RF dt = configuration.get<RF>("driver.dt");
+  RF dt_min = 1e-8;
+
+  // Visualize initial condition
+  using VTKSW = Dune::VTKSequenceWriter<GV>;
+  using VTKWriter = Dune::SubsamplingVTKWriter<GV>;
+  VTKWriter vtkwriter(gv, 2);
+  VTKSW vtkSequenceWriter(std::make_shared<VTKWriter>(vtkwriter), filename);
+  CuttingPredicate predicate;
+  Dune::PDELab::addSolutionToVTKWriter(vtkSequenceWriter, gfs, xold, Dune::PDELab::vtk::defaultNameScheme(), predicate);
+  vtkSequenceWriter.write(time, Dune::VTK::appendedraw);
+
+  V x(gfs,0.0);
+
+#ifdef NORMALIZE_PRESSURE
+  // Pressure normalization
+  using PressureSubGFS = typename Dune::PDELab::GridFunctionSubSpace <GFS,Dune::TypeTree::TreePath<1> >;
+  PressureSubGFS pressureSubGfs(gfs);
+  using PDGF = Dune::PDELab::DiscreteGridFunction<PressureSubGFS,V>;
+  PDGF pdgf(pressureSubGfs,x);
+  typename PDGF::Traits::RangeType pressure_integral(0);
+
+  int elements = int(sqrt(gv.size(0)));
+  int pressure_index = elements * elements * dim * pow((velocity_degree + 1), dim);
+  using Dune::PDELab::Backend::native;
+  std::cout << std::endl;
+  std::cout << "info elements: " << elements << std::endl;
+  std::cout << "info pressure_index: " << pressure_index << std::endl;
+  std::cout << "info gfs.size(): " << gfs.size() << std::endl;
+  std::cout << "info native(x).size(): " << native(x).size() << std::endl;
+  std::cout << std::endl;
+#endif
+
+  // Time loop: step with fixed dt until time is within dt_min/2 of time_end
+  int step = 0;
+  const int nth = configuration.get<int>("driver.nth");
+  while (time < time_end - dt_min*0.5){
+    osm.apply(time,dt,xold,x);
+
+#ifdef NORMALIZE_PRESSURE
+    // Correct pressure after each step. Without this pressure
+    // correction the velocity will be ok but the pressure will be
+    // shifted by a constant.
+    Dune::PDELab::integrateGridFunction(pdgf,pressure_integral,2);
+    pressure_integral = gv.comm().sum(pressure_integral);
+    std::cout << gv.comm().rank() << " pressure_integral before normalization: " << pressure_integral << std::endl;
+
+    // Scale integral
+    pressure_integral = pressure_integral/4;
+    for (int i=pressure_index; i<gfs.size(); ++i){
+      native(x)[i] -= pressure_integral;
+    }
+    Dune::PDELab::integrateGridFunction(pdgf,pressure_integral,2);
+    pressure_integral = gv.comm().sum(pressure_integral);
+    std::cout << "pressure_integral after normalization: " << pressure_integral << std::endl;
+#endif
+
+    xold = x;
+    time += dt;
+    step++;
+
+    if(nth!=0 && step%nth==0){  // nth == 0 disables intermediate output instead of dividing by zero
+      vtkSequenceWriter.write(time, Dune::VTK::appendedraw);
+    }
+  }
+}
+
+// Program entry point: reads taylor-green.ini from the working directory, builds a 2D
+// YaspGrid on [-1,1]^2 (periodic with overlap 1 when PERIODIC is defined) and runs
+// taylor_green<GV,double,2,1>. Returns 1 if an exception is caught, 0 otherwise.
+int main(int argc, char** argv)
+{
+  try{
+    // Maybe initialize Mpi
+    Dune::MPIHelper::instance(argc, argv);
+
+    // Read ini file
+    Dune::ParameterTree configuration;
+    const std::string config_filename("taylor-green.ini");
+    std::cout << "Reading ini-file \""<< config_filename
+              << "\"" << std::endl;
+    Dune::ParameterTreeParser::readINITree(config_filename, configuration);
+
+    // Create grid; the cell count is an integer, so parse it as one
+    const int dim = 2;
+    const int cells_per_dir = configuration.get<int>("driver.cells_per_dir");
+    Dune::FieldVector<double,dim> lowerleft(-1.0);
+    Dune::FieldVector<double,dim> upperright(1.0);
+    std::array<int, dim> cells(Dune::fill_array<int, dim>(cells_per_dir));
+    std::bitset<dim> periodic(false);
+    int overlap = 0;
+#ifdef PERIODIC
+    periodic[0] = true;
+    periodic[1] = true;
+    overlap = 1;
+#endif
+    using Grid = Dune::YaspGrid<dim, Dune::EquidistantOffsetCoordinates<double, dim> >;
+    Grid grid(lowerleft, upperright, cells, periodic, overlap);
+
+    // Solve problem
+    using GV = Grid::LeafGridView;
+    const GV gv=grid.leafGridView();
+    Dune::dinfo.push(false);
+    taylor_green<GV,double,2,1>(gv,configuration,"taylor-green");
+    return 0;
+  }
+  catch (const Dune::Exception &e){
+    std::cerr << "Dune reported error: " << e << std::endl;
+    return 1;
+  }
+  catch (...){
+    std::cerr << "Unknown exception thrown!" << std::endl;
+    return 1;
+  }
+}
diff --git a/test/navier-stokes/reference_program/taylor-green.hh b/test/navier-stokes/reference_program/taylor-green.hh
new file mode 100644
index 0000000000000000000000000000000000000000..fc30069f6507ac9e10f36c147fa9188ce8f86293
--- /dev/null
+++ b/test/navier-stokes/reference_program/taylor-green.hh
@@ -0,0 +1,149 @@
+#ifndef TAYLOR_GREEN_HH
+#define TAYLOR_GREEN_HH
+
+//===============================================================
+// Define parameter functions f,g,j and \partial\Omega_D/N
+//===============================================================
+
+// Boundary condition selector that reports velocity Dirichlet for every intersection.
+class BCTypeParamGlobalDirichlet
+{
+public:
+  using BC = Dune::PDELab::StokesBoundaryCondition;
+
+  struct Traits
+  {
+    using RangeType = BC::Type;
+  };
+
+  BCTypeParamGlobalDirichlet() {}
+
+  // Writes BC::VelocityDirichlet to y; the intersection and the local coordinate are ignored.
+  template<typename Intersection>
+  void evaluate (const Intersection&,
+                 const Dune::FieldVector<typename Intersection::ctype, Intersection::dimension-1>&,
+                 BC::Type& y) const
+  {
+    y = BC::VelocityDirichlet;
+  }
+
+  template<typename T>
+  void setTime(T) {}  // nothing is stored: evaluate() does not use a time
+};
+
+
+// Analytic Taylor-Green velocity; evaluateGlobal() uses the time last passed to setTime().
+template<typename GV, typename RF, int dim>
+class TaylorGreenVelocity
+  : public Dune::PDELab::AnalyticGridFunctionBase<
+      Dune::PDELab::AnalyticGridFunctionTraits<GV,RF,dim>,
+      TaylorGreenVelocity<GV,RF,dim> >
+{
+public:
+  using Traits = Dune::PDELab::AnalyticGridFunctionTraits<GV,RF,dim>;
+  using BaseT = Dune::PDELab::AnalyticGridFunctionBase<Traits,TaylorGreenVelocity<GV,RF,dim> >;
+
+  using DomainType = typename Traits::DomainType;
+  using RangeType = typename Traits::RangeType;
+
+  TaylorGreenVelocity(const GV & gv, const Dune::ParameterTree& params) : BaseT(gv)
+  {
+    mu = params.get<RF>("mu");
+    rho = params.get<RF>("rho");
+    time = 0.0;
+  }
+
+  void evaluateGlobal(const DomainType & x, RangeType & y) const
+  {
+    // TODO (kept from the original author): get mu and rho from somewhere else!
+    const RF pi = 3.14159265358979323846;
+    const RF nu = mu/rho;
+    const auto decay = exp(-2.0*pi*pi*nu*time);
+    y[0] = -decay*cos(pi*x[0])*sin(pi*x[1]);
+    y[1] = decay*sin(pi*x[0])*cos(pi*x[1]);
+  }
+
+  template <typename T>
+  void setTime(T t) { time = t; }
+
+private:
+  RF rho;
+  RF mu;
+  RF time;
+};
+
+
+// Analytic Taylor-Green pressure; evaluateGlobal() uses the time last passed to setTime().
+template<typename GV, typename RF>
+class TaylorGreenPressure
+  : public Dune::PDELab::AnalyticGridFunctionBase<
+      Dune::PDELab::AnalyticGridFunctionTraits<GV,RF,1>,
+      TaylorGreenPressure<GV,RF> >
+{
+public:
+  using Traits = Dune::PDELab::AnalyticGridFunctionTraits<GV,RF,1>;
+  using BaseT = Dune::PDELab::AnalyticGridFunctionBase<Traits,TaylorGreenPressure<GV,RF> >;
+
+  using DomainType = typename Traits::DomainType;
+  using RangeType = typename Traits::RangeType;
+
+  // Reads "mu" and "rho" from params and starts at time 0.
+  TaylorGreenPressure (const GV& gv, const Dune::ParameterTree& params) : BaseT(gv)
+  {
+    mu = params.get<RF>("mu");
+    rho = params.get<RF>("rho");
+    time = 0.0;
+  }
+
+  // Writes the pressure at global position x and the stored time to y.
+  void evaluateGlobal (const DomainType& x, RangeType& y) const
+  {
+    const RF pi = 3.14159265358979323846;
+    const RF nu = mu/rho;
+    y = -0.25*rho*exp(-4.0*pi*pi*nu*time)*(cos(2.0*pi*x[0])+cos(2.0*pi*x[1]));
+  }
+
+  template<typename T>
+  void setTime(T t) { time = t; }
+
+private:
+  RF rho;
+  RF mu;
+  RF time;
+};
+
+
+
+// Analytic grid function with dim_range components that evaluates to zero everywhere.
+template<typename GV, typename RF, std::size_t dim_range>
+class ZeroVectorFunction
+  : public Dune::PDELab::AnalyticGridFunctionBase<
+      Dune::PDELab::AnalyticGridFunctionTraits<GV,RF,dim_range>,
+      ZeroVectorFunction<GV,RF,dim_range> >,
+    public Dune::PDELab::InstationaryFunctionDefaults
+{
+public:
+  using Traits = Dune::PDELab::AnalyticGridFunctionTraits<GV,RF,dim_range>;
+  using BaseT = Dune::PDELab::AnalyticGridFunctionBase<Traits, ZeroVectorFunction>;
+
+  using DomainType = typename Traits::DomainType;
+  using RangeType = typename Traits::RangeType;
+
+  ZeroVectorFunction(const GV & gv) : BaseT(gv) {}
+
+  // Assigns 0 to y; the evaluation position is not used.
+  void evaluateGlobal(const DomainType&, RangeType & y) const
+  { y=0; }
+};
+
+// Scalar special case of ZeroVectorFunction (one range component).
+template<typename GV, typename RF>
+class ZeroScalarFunction
+  : public ZeroVectorFunction<GV,RF,1>
+{
+public:
+  // Passes the grid view on to the ZeroVectorFunction base class.
+  ZeroScalarFunction(const GV & gv) : ZeroVectorFunction<GV,RF,1>(gv) {}
+};
+
+#endif
diff --git a/test/navier-stokes/reference_program/taylor-green.ini b/test/navier-stokes/reference_program/taylor-green.ini
new file mode 100644
index 0000000000000000000000000000000000000000..b7e94815a97507b65f0d85838c63882e349b860d
--- /dev/null
+++ b/test/navier-stokes/reference_program/taylor-green.ini
@@ -0,0 +1,14 @@
+[parameters]
+rho = 1.0
+mu = 0.01
+
+[parameters.dg]
+epsilon = -1
+sigma = 6.0
+beta = 1.0
+
+[driver]
+time_end = 1.0
+dt = 1e-3
+nth = 20
+cells_per_dir = 16
diff --git a/test/navier-stokes/reference_program/taylor_green_solution.py b/test/navier-stokes/reference_program/taylor_green_solution.py
new file mode 100644
index 0000000000000000000000000000000000000000..0def653962e95367a9b24ee5f5a80773cbb0340e
--- /dev/null
+++ b/test/navier-stokes/reference_program/taylor_green_solution.py
@@ -0,0 +1,99 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+def pressure(t, x, y):
+    """Exact Taylor-Green pressure at time t and point(s) (x, y) for rho = 1, mu = 0.01."""
+    rho = 1.0
+    nu = (1.0/100)/rho
+    decay = np.exp(-4.0*np.pi**2*nu*t)
+    return -0.25*rho*decay*(np.cos(2.0*np.pi*x) + np.cos(2.0*np.pi*y))
+
+
+def velocity(t, x, y):
+    """Exact Taylor-Green velocity (v_0, v_1) at time t and point (x, y), as an array."""
+    rho = 1.0
+    nu = (1.0/100)/rho
+    pi = np.pi
+    # The decay rate is 2*pi^2*nu, as in TaylorGreenVelocity::evaluateGlobal (taylor-green.hh).
+    decay = np.exp(-2.0*pi**2*nu*t)
+    return np.array([-decay*np.cos(pi*x)*np.sin(pi*y),
+                     decay*np.sin(pi*x)*np.cos(pi*y)])
+
+
+def v_0(t, x, y):
+    """First component of the exact Taylor-Green velocity at time t and point(s) (x, y)."""
+    rho = 1.0
+    nu = (1.0/100)/rho
+    # The decay rate is 2*pi^2*nu, as in TaylorGreenVelocity::evaluateGlobal (taylor-green.hh).
+    return -np.exp(-2.0*np.pi**2*nu*t)*np.cos(np.pi*x)*np.sin(np.pi*y)
+
+
+def v_1(t, x, y):
+    """Second component of the exact Taylor-Green velocity at time t and point(s) (x, y)."""
+    rho = 1.0
+    nu = (1.0/100)/rho
+    # The decay rate is 2*pi^2*nu, as in TaylorGreenVelocity::evaluateGlobal (taylor-green.hh).
+    return np.exp(-2.0*np.pi**2*nu*t)*np.sin(np.pi*x)*np.cos(np.pi*y)
+
+
+def velocity_norm(t, x, y):
+    """Euclidean norm of the exact Taylor-Green velocity at time t and point(s) (x, y)."""
+    nu = (1.0/100)/1.0  # mu/rho with mu = 0.01 and rho = 1
+    pi = np.pi
+    decay = np.exp(-2.0*pi**2*nu*t)  # rate 2*pi^2*nu, as in taylor-green.hh
+    return np.sqrt((-decay*np.cos(pi*x)*np.sin(pi*y))**2 + (decay*np.sin(pi*x)*np.cos(pi*y))**2)
+
+
+def plot_pressure(t, n):
+    """Show a filled contour plot of pressure(t, ., .) sampled with spacing 2/n on [-1, 1)^2."""
+    step = 2.0/n
+    axis_x = np.arange(-1,1,step)
+    axis_y = np.arange(-1,1,step)
+    grid_x, grid_y = np.meshgrid(axis_x, axis_y, sparse=True)
+    contours = plt.contourf(axis_x, axis_y, pressure(t, grid_x, grid_y))
+    plt.colorbar(contours)
+    plt.show()
+
+
+def minmax_pressure(t, n):
+    """Return (min, max) of pressure(t, ., .) sampled with spacing 2/n on [-1, 1)^2."""
+    step = 2.0/n
+    axis_x = np.arange(-1,1,step)
+    axis_y = np.arange(-1,1,step)
+    samples = pressure(t, *np.meshgrid(axis_x, axis_y, sparse=True))
+    return np.min(samples), np.max(samples)
+
+
+def plot_velocity(t, n):
+    """Show a filled contour plot of velocity_norm(t, ., .) sampled with spacing 2/n on [-1, 1)^2."""
+    step = 2.0/n
+    axis_x = np.arange(-1,1,step)
+    axis_y = np.arange(-1,1,step)
+    grid_x, grid_y = np.meshgrid(axis_x, axis_y, sparse=True)
+    contours = plt.contourf(axis_x, axis_y, velocity_norm(t, grid_x, grid_y))
+    plt.colorbar(contours)
+    plt.show()
+
+
+def minmax_velocity_norm(t, n):
+    """Return (min, max) of velocity_norm(t, ., .) sampled with spacing 2/n on [-1, 1)^2."""
+    step = 2.0/n
+    axis_x = np.arange(-1,1,step)
+    axis_y = np.arange(-1,1,step)
+    samples = velocity_norm(t, *np.meshgrid(axis_x, axis_y, sparse=True))
+    return np.min(samples), np.max(samples)
+
+
+print(minmax_velocity_norm(1.0, 64))
+
+# plot_pressure(1.0, 100)
+# plot_velocity(1.0, 100)
+
+# dt = 1.0e-4
+# n = 1000
+# t = 0.0
+
+# for i in range(20):
+#     minimum, maximum = minmax_pressure(t, n)
+#     print("t: {}, n: {}, minimum: {}, maximum: {}".format(t,n,minimum,maximum))
+#     t = t + dt
diff --git a/test/nonlinear/diffusivewave.mini b/test/nonlinear/diffusivewave.mini
index 2f877983394b41d54bb1592276d71d052776ca7e..c82f8ed23330982d6caf40f21d71dbb5d96f0588 100644
--- a/test/nonlinear/diffusivewave.mini
+++ b/test/nonlinear/diffusivewave.mini
@@ -14,5 +14,12 @@ dt = 0.001
 T = 0.01
 
 [formcompiler]
+operators = mass, poisson
+
+[formcompiler.mass]
+sumfact = 0, 1 | expand sf
+fastdg = 0, 0 | expand sf
+
+[formcompiler.poisson]
 sumfact = 0, 1 | expand sf
-fastdg = 0, 0 | expand sf
\ No newline at end of file
+fastdg = 0, 0 | expand sf
diff --git a/test/nonlinear/diffusivewave.ufl b/test/nonlinear/diffusivewave.ufl
index 66b2524332f317ac05bcdcbd19404315931c974e..5dddfe3cb719343a82fd74f9a109460a97ecb71e 100644
--- a/test/nonlinear/diffusivewave.ufl
+++ b/test/nonlinear/diffusivewave.ufl
@@ -1,8 +1,10 @@
 cell = quadrilateral
+degree = 1
+dim = 2
 
 x = SpatialCoordinate(cell)
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -10,7 +12,11 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+# h_ext = CellVolume(cell) / FacetArea(cell)
+# gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
@@ -20,16 +26,16 @@ K = u**(5./3.)
 # / Max(1e-8, norm)
 
 poisson = inner(K*grad(u), grad(v))*dx \
-  + inner(n, avg(K*grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(K*grad(v)), n)*dS
+  - inner(n, avg(K*grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(K*grad(v)), n)*dS
 #  - inner(n, K*grad(u))*v*ds \
-#  + gamma*u*v*ds \
+#  + gamma_ext*u*v*ds \
 #  + theta*u*inner(K*grad(v), n)*ds \
-#  - theta*g*inner(K*grad(v), n)*ds \
-#  - gamma*g*v*ds
+#  - gamma_ext*g*v*ds \
+#  - theta*g*inner(K*grad(v), n)*ds
 
 mass = (u*v)*dx
 
 forms = [mass, poisson]
-dirichlet_expression = sin(pi*x[0])
+interpolate_expression = sin(pi*x[0])
diff --git a/test/nonlinear/nonlinear.ufl b/test/nonlinear/nonlinear.ufl
index f81b287e504909b1381a4ac71f3a84af5e34a01c..425c6cee834c2e940ef2a6b698ac5eb22eaf4bd9 100644
--- a/test/nonlinear/nonlinear.ufl
+++ b/test/nonlinear/nonlinear.ufl
@@ -10,7 +10,6 @@ v = TestFunction(V)
 
 r = (inner(grad(u), grad(v)) + u*u*v - f*v)*dx
 
-forms = [r]
 exact_solution = g
-dirichlet_expression = g
+interpolate_expression = g
 is_dirichlet = 1
\ No newline at end of file
diff --git a/test/nonlinear/nonlinear_dg.ufl b/test/nonlinear/nonlinear_dg.ufl
index 5fba927e4256b2c32d1b4a5766c78046dffbddc1..41dfb430c1048e011c04843f496b35947695877f 100644
--- a/test/nonlinear/nonlinear_dg.ufl
+++ b/test/nonlinear/nonlinear_dg.ufl
@@ -1,10 +1,12 @@
 cell = "triangle"
-x = SpatialCoordinate(cell)
+degree = 1
+dim = 2
 
+x = SpatialCoordinate(cell)
 f = -4.
 g = x[0]*x[0] + x[1]*x[1]
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -15,22 +17,25 @@ def q(u):
     return u*u
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
-  - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
-  + theta*u*inner(grad(v), n)*ds \
   - f*v*dx \
   + q(u)*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/nonlinear/nonlinear_dg_matrix_free.mini b/test/nonlinear/nonlinear_dg_matrix_free.mini
index 8a1de68470a4375d05253337686105bcb35e6fea..dfd4035d77c067a4678a4c834759fe63ef750f1f 100644
--- a/test/nonlinear/nonlinear_dg_matrix_free.mini
+++ b/test/nonlinear/nonlinear_dg_matrix_free.mini
@@ -11,9 +11,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 5e-3
+
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
 matrix_free = 1
-compare_l2errorsquared = 5e-3
 
 # Disable numerical methods (not working in PDELab?)
 {__exec_suffix} == numdiff | exclude
diff --git a/test/nonlinear/nonlinear_matrix_free.mini b/test/nonlinear/nonlinear_matrix_free.mini
index 644297ab361203ec049c27529904d8b09dd2261a..0f28479de167747bebb646b4ad3c4852a417192a 100644
--- a/test/nonlinear/nonlinear_matrix_free.mini
+++ b/test/nonlinear/nonlinear_matrix_free.mini
@@ -11,6 +11,8 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 6e-4
+
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
 matrix_free = 1
-compare_l2errorsquared = 6e-4
diff --git a/test/poisson/CMakeLists.txt b/test/poisson/CMakeLists.txt
index 68c635bc79eef0818780ca09d36c808c1eb01954..a05cbe8c3650c9e33f8f5c4f1bbce696b2df9ba8 100644
--- a/test/poisson/CMakeLists.txt
+++ b/test/poisson/CMakeLists.txt
@@ -67,6 +67,12 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg_tensor.ufl
                                   INIFILE poisson_dg_tensor.mini
                                   )
 
+# 12. Poisson Test Case with a custom function
+dune_add_formcompiler_system_test(UFLFILE poisson_customfunction.ufl
+                                  BASENAME poisson_customfunction
+                                  INIFILE poisson_customfunction.mini
+                                  )
+
 # the reference vtk file
 add_executable(poisson_dg_ref reference_main.cc)
 set_target_properties(poisson_dg_ref PROPERTIES EXCLUDE_FROM_ALL 1)
diff --git a/test/poisson/dimension-grid-variations/poisson_1d_cg_interval.ufl b/test/poisson/dimension-grid-variations/poisson_1d_cg_interval.ufl
index 5eefb2d7c12d0e18f1638949abb4bd0524512fd8..d584ea4b577deeec3a893e2ca636d0371c36fb5f 100644
--- a/test/poisson/dimension-grid-variations/poisson_1d_cg_interval.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_1d_cg_interval.ufl
@@ -9,7 +9,7 @@ V = FiniteElement("CG", cell, 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
+interpolate_expression = g
diff --git a/test/poisson/dimension-grid-variations/poisson_1d_dg_interval.ufl b/test/poisson/dimension-grid-variations/poisson_1d_dg_interval.ufl
index 535d102440fa6c5f5405f61ab0ef82285dfeaa7a..51776b3acb57f87519bb2d493a1e833034c29b3d 100644
--- a/test/poisson/dimension-grid-variations/poisson_1d_dg_interval.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_1d_dg_interval.ufl
@@ -1,11 +1,13 @@
 cell = "interval"
+degree = 1
+dim = 1
 
 x = SpatialCoordinate(cell)
 c = (0.5-x[0])**2
 g = exp(-1.*c)
 f = 2*(1.-2*c)*g
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -13,21 +15,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/poisson/dimension-grid-variations/poisson_2d_cg_quadrilateral.ufl b/test/poisson/dimension-grid-variations/poisson_2d_cg_quadrilateral.ufl
index d3771aacfe8f806cb3fd1e31d9ba581106885247..5d1921828127bbe2024d36c8674bbe4d9c868190 100644
--- a/test/poisson/dimension-grid-variations/poisson_2d_cg_quadrilateral.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_2d_cg_quadrilateral.ufl
@@ -9,7 +9,7 @@ V = FiniteElement("CG", cell, 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
+interpolate_expression = g
diff --git a/test/poisson/dimension-grid-variations/poisson_2d_cg_triangle.ufl b/test/poisson/dimension-grid-variations/poisson_2d_cg_triangle.ufl
index e95064dfb1e0d947c51a1c021ac4f0f7a9f7adc1..a720e454218bd38347e62246a84e61fb38d5a0ac 100644
--- a/test/poisson/dimension-grid-variations/poisson_2d_cg_triangle.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_2d_cg_triangle.ufl
@@ -8,7 +8,7 @@ V = FiniteElement("CG", "triangle", 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
+interpolate_expression = g
diff --git a/test/poisson/dimension-grid-variations/poisson_2d_dg_quadrilateral.ufl b/test/poisson/dimension-grid-variations/poisson_2d_dg_quadrilateral.ufl
index 84019c1e87a0a368429d49a9c787344dc6bcdaf2..87fd9b591b7d6ce406b655d52926d7117218a90d 100644
--- a/test/poisson/dimension-grid-variations/poisson_2d_dg_quadrilateral.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_2d_dg_quadrilateral.ufl
@@ -1,11 +1,13 @@
 cell = "quadrilateral"
+degree = 1
+dim = 2
 
 x = SpatialCoordinate(cell)
 c = (0.5-x[0])**2 + (0.5-x[1])**2
 g = exp(-1.*c)
 f = 2*(2.-2*c)*g
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -13,21 +15,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/poisson/dimension-grid-variations/poisson_2d_dg_triangle.ufl b/test/poisson/dimension-grid-variations/poisson_2d_dg_triangle.ufl
index d518b11815bc40279f08f2a8802f41c02d2f5703..250b6900d28fa305534334c37d6e07563ad36aa5 100644
--- a/test/poisson/dimension-grid-variations/poisson_2d_dg_triangle.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_2d_dg_triangle.ufl
@@ -1,10 +1,12 @@
 cell = triangle
+degree = 1
+dim = 2
 
 x = SpatialCoordinate(cell)
 f = -4.
 g = x[0]*x[0] + x[1]*x[1]
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -12,21 +14,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/poisson/dimension-grid-variations/poisson_3d_cg_hexahedron.ufl b/test/poisson/dimension-grid-variations/poisson_3d_cg_hexahedron.ufl
index 93e880daec3964f358b303d02dcb45bca0314743..9a9b16fc17df5b1d9df5d96b7e7e9e49853a09c5 100644
--- a/test/poisson/dimension-grid-variations/poisson_3d_cg_hexahedron.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_3d_cg_hexahedron.ufl
@@ -8,7 +8,7 @@ V = FiniteElement("CG", cell, 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
+interpolate_expression = g
diff --git a/test/poisson/dimension-grid-variations/poisson_3d_cg_tetrahedron.ufl b/test/poisson/dimension-grid-variations/poisson_3d_cg_tetrahedron.ufl
index 61ebabe881158548f770b69b999759285520439e..b9df144c0534c800675cdeaf95343cb504b82bfe 100644
--- a/test/poisson/dimension-grid-variations/poisson_3d_cg_tetrahedron.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_3d_cg_tetrahedron.ufl
@@ -8,7 +8,7 @@ V = FiniteElement("CG", "tetrahedron", 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
+interpolate_expression = g
diff --git a/test/poisson/dimension-grid-variations/poisson_3d_dg_hexahedron.ufl b/test/poisson/dimension-grid-variations/poisson_3d_dg_hexahedron.ufl
index 8abb5fb0d42197f3b87f38389f222efafeccb30b..810bfa46234b7a3951173cb36e9825a3dd7338c0 100644
--- a/test/poisson/dimension-grid-variations/poisson_3d_dg_hexahedron.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_3d_dg_hexahedron.ufl
@@ -1,11 +1,13 @@
 cell = "hexahedron"
+degree = 1
+dim = 3
 
 x = SpatialCoordinate(cell)
 c = (0.5 - x[0])**2 + (0.5 - x[1])**2 + (0.5 - x[2])**2
 g = exp(-1.*c)
 f = 2*(3.-2*c)*g
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -13,21 +15,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
-exact_solution = g
+exact_solution = g
\ No newline at end of file
diff --git a/test/poisson/dimension-grid-variations/poisson_3d_dg_tetrahedron.ufl b/test/poisson/dimension-grid-variations/poisson_3d_dg_tetrahedron.ufl
index 384d582e45cd0d820e0d57ef40abb2d07ada88e2..f3af6ada266ea8b3099e37ebc544688bf6bfca18 100644
--- a/test/poisson/dimension-grid-variations/poisson_3d_dg_tetrahedron.ufl
+++ b/test/poisson/dimension-grid-variations/poisson_3d_dg_tetrahedron.ufl
@@ -1,10 +1,12 @@
 cell = "tetrahedron"
+degree = 1
+dim = 3
 
 x = SpatialCoordinate(cell)
 f = -6.
 g = x[0]*x[0] + x[1]*x[1] + x[2]*x[2]
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -12,21 +14,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
-exact_solution = g
+exact_solution = g
\ No newline at end of file
diff --git a/test/poisson/opcount_poisson_dg.ufl b/test/poisson/opcount_poisson_dg.ufl
index 1748e05796f0a0c82073c5b7fb24db9631891c71..4962ce0cefce2cd5ed901994eb8a3b8b75f187a9 100644
--- a/test/poisson/opcount_poisson_dg.ufl
+++ b/test/poisson/opcount_poisson_dg.ufl
@@ -1,9 +1,9 @@
-degree = 1
 cell = "quadrilateral"
 
+degree = 1
 dim = 2
-x = SpatialCoordinate(cell)
 
+x = SpatialCoordinate(cell)
 f = -4.
 g = x[0]*x[0] + x[1]*x[1]
 
@@ -25,15 +25,14 @@ gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
   + gamma_int*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
   + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma_ext*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
-exact_solution = g
+exact_solution = g
\ No newline at end of file
diff --git a/test/poisson/poisson.mini b/test/poisson/poisson.mini
index 3597ac0536fddfa5dcb83f5e6f4fdafe09419360..6fac6a11ae0e1f70024452f5c61eb7850ce55450 100644
--- a/test/poisson/poisson.mini
+++ b/test/poisson/poisson.mini
@@ -12,5 +12,7 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
diff --git a/test/poisson/poisson.ufl b/test/poisson/poisson.ufl
index 2bfe33131e8fc3456a1d68781874a1bcae0f87af..5c6cf421ebc94b7419a8aa1d8d6df29f28cf0bb2 100644
--- a/test/poisson/poisson.ufl
+++ b/test/poisson/poisson.ufl
@@ -11,7 +11,7 @@ u = TrialFunction(V)
 v = TestFunction(V)
 
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
-dirichlet_expression = g
-is_dirichlet = 1
\ No newline at end of file
+interpolate_expression = g
+is_dirichlet = 1
diff --git a/test/poisson/poisson_customfunction.mini b/test/poisson/poisson_customfunction.mini
new file mode 100644
index 0000000000000000000000000000000000000000..c45e50fabd2b2b7743a0a6d94a3eb88e6926800f
--- /dev/null
+++ b/test/poisson/poisson_customfunction.mini
@@ -0,0 +1,18 @@
+__name = poisson_customfunction_{__exec_suffix}
+__exec_suffix = numdiff, symdiff | expand num
+
+lowerleft = 0.0 0.0
+upperright = 1.0 1.0
+elements = 32 32
+elementType = simplical
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
diff --git a/test/poisson/poisson_customfunction.ufl b/test/poisson/poisson_customfunction.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..026e069d697e8bfe05e8393abaf1118169f884a3
--- /dev/null
+++ b/test/poisson/poisson_customfunction.ufl
@@ -0,0 +1,33 @@
+import ufl
+
+cell = triangle
+
+x = SpatialCoordinate(cell)
+
+class SquareFct(ufl.classes.MathFunction):
+    def __init__(self, arg):
+        ufl.classes.MathFunction.__init__(self, 'square', arg)
+
+    def _ufl_expr_reconstruct_(self, *operands):
+        return SquareFct(*operands)
+
+    def derivative(self):
+        return 2 * self.ufl_operands[0]
+
+    def visit(self, visitor):
+        op = visitor.call(self.ufl_operands[0])
+        return op * op
+
+
+c = SquareFct(0.5-x[0]) + SquareFct(0.5-x[1])
+g = exp(-1.*c)
+f = 4*(1.-c)*g
+
+V = FiniteElement("CG", cell, 1)
+u = TrialFunction(V)
+v = TestFunction(V)
+
+r = (inner(grad(u), grad(v)) - f*v)*dx
+exact_solution = g
+interpolate_expression = g
+is_dirichlet = 1
\ No newline at end of file
diff --git a/test/poisson/poisson_dg.mini b/test/poisson/poisson_dg.mini
index fd859d6045eb496ab40e5a67a69d14e289bd9d19..bb806382e6b6fe110e1149bd58821bba2820a33e 100644
--- a/test/poisson/poisson_dg.mini
+++ b/test/poisson/poisson_dg.mini
@@ -12,5 +12,7 @@ reference = poisson_dg_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 9e-8
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
\ No newline at end of file
diff --git a/test/poisson/poisson_dg.ufl b/test/poisson/poisson_dg.ufl
index 9ac02bdf061feb9788488ba188e9d3b864ed59e0..75b536b851aa6752ea3c7505087f61f7f85177bd 100644
--- a/test/poisson/poisson_dg.ufl
+++ b/test/poisson/poisson_dg.ufl
@@ -25,15 +25,14 @@ gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
   + gamma_int*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
   + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma_ext*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
\ No newline at end of file
diff --git a/test/poisson/poisson_dg_matrix_free.mini b/test/poisson/poisson_dg_matrix_free.mini
index 4fa26853c64f1075cc0de65ba1937890066bf54b..0d8d4cd6734673b944009f8377847588879d4a37 100644
--- a/test/poisson/poisson_dg_matrix_free.mini
+++ b/test/poisson/poisson_dg_matrix_free.mini
@@ -12,6 +12,8 @@ reference = poisson_dg_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
-matrix_free = 1
 compare_l2errorsquared = 1e-6
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
+matrix_free = 1
\ No newline at end of file
diff --git a/test/poisson/poisson_dg_neumann.mini b/test/poisson/poisson_dg_neumann.mini
index 43157de9980380b50d1a5f8a29df2997309ef7cd..7d930fb92e7813ff37c9c4ea49bdc7c2ae34d80b 100644
--- a/test/poisson/poisson_dg_neumann.mini
+++ b/test/poisson/poisson_dg_neumann.mini
@@ -12,5 +12,7 @@ reference = poisson_dg_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 9e-8
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
diff --git a/test/poisson/poisson_dg_neumann.ufl b/test/poisson/poisson_dg_neumann.ufl
index 3cedfc0877164e86e6edef343de11784a99d1d0e..bdcea093ef8d0d0b53f4a965513d5d8ae1b058e2 100644
--- a/test/poisson/poisson_dg_neumann.ufl
+++ b/test/poisson/poisson_dg_neumann.ufl
@@ -1,6 +1,6 @@
-dim = 3
-degree = 1
 cell = triangle
+degree = 1
+dim = 2
 
 x = SpatialCoordinate(cell)
 c = (0.5-x[0])**2 + (0.5-x[1])**2
@@ -30,16 +30,15 @@ gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
   + gamma_int*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds(1) \
   + gamma_ext*u*v*ds(1) \
   + theta*u*inner(grad(v), n)*ds(1) \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds(1) \
   - gamma_ext*g*v*ds(1) \
+  - theta*g*inner(grad(v), n)*ds(1) \
   - j*v*ds(0)
 
-forms = [r]
 exact_solution = g
diff --git a/test/poisson/poisson_dg_quadrilateral.mini b/test/poisson/poisson_dg_quadrilateral.mini
index 05da536acc9dd1864dd59f5d817f397e131ed541..dde2e495ab819ed22be6b848518ed71d41cfe30f 100644
--- a/test/poisson/poisson_dg_quadrilateral.mini
+++ b/test/poisson/poisson_dg_quadrilateral.mini
@@ -10,5 +10,7 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 7e-7
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
diff --git a/test/poisson/poisson_dg_quadrilateral.ufl b/test/poisson/poisson_dg_quadrilateral.ufl
index 30b1eb8bcd225600c9f78801f3498077388444f3..7387894341f25221d412ae967717a3bfca8788f4 100644
--- a/test/poisson/poisson_dg_quadrilateral.ufl
+++ b/test/poisson/poisson_dg_quadrilateral.ufl
@@ -1,11 +1,13 @@
 cell = "quadrilateral"
+degree = 1
+dim = 2
 
 x = SpatialCoordinate(cell)
 c = (0.5-x[0])**2 + (0.5-x[1])**2
 g = exp(-1.*c)
 f = 4*(1.-c)*g
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -13,21 +15,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/poisson/poisson_dg_tensor.mini b/test/poisson/poisson_dg_tensor.mini
index 52df8e1da0f9cadae3d553e00092c053dafac4c1..d696cebc5196bd711ef4e8e2e1371a977be3a31f 100644
--- a/test/poisson/poisson_dg_tensor.mini
+++ b/test/poisson/poisson_dg_tensor.mini
@@ -12,5 +12,7 @@ reference = poisson_dg_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 4e-6
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
\ No newline at end of file
diff --git a/test/poisson/poisson_dg_tensor.ufl b/test/poisson/poisson_dg_tensor.ufl
index 9409ece67ae08270896caf5adf87ced67ebced01..ba8eb74d084f7e4b3efc9e940beb616e305b609c 100644
--- a/test/poisson/poisson_dg_tensor.ufl
+++ b/test/poisson/poisson_dg_tensor.ufl
@@ -1,6 +1,6 @@
-dim = 2
-degree = 1
 cell = quadrilateral
+degree = 1
+dim = 2
 
 x = SpatialCoordinate(cell)
 
@@ -28,15 +28,14 @@ gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 theta = 1.0
 
 r = (inner(A*grad(u), grad(v)) + c*u*v)*dx \
-  + inner(n, A*avg(grad(u)))*jump(v)*dS \
+  - f*v*dx \
+  - inner(n, A*avg(grad(u)))*jump(v)*dS \
   + gamma_int*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(A*avg(grad(v)), n)*dS \
+  + theta*jump(u)*inner(A*avg(grad(v)), n)*dS \
   - inner(n, A*grad(u))*v*ds \
   + gamma_ext*u*v*ds \
   + theta*u*inner(A*grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(A*grad(v), n)*ds \
-  - gamma_ext*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(A*grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/poisson/poisson_matrix_free.mini b/test/poisson/poisson_matrix_free.mini
index 5709ef999b6447810847c80948fb010994fd07fc..3372c8ac453f91d001c3178110e1d6caacec6b48 100644
--- a/test/poisson/poisson_matrix_free.mini
+++ b/test/poisson/poisson_matrix_free.mini
@@ -11,5 +11,7 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-matrix_free = 1
 compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+matrix_free = 1
\ No newline at end of file
diff --git a/test/poisson/poisson_neumann.mini b/test/poisson/poisson_neumann.mini
index 0c4aa9c7ff2f7470f9f39e146c933dbb4f452ee0..76a1fa9e12af16426692c197b80a14599aaee74b 100644
--- a/test/poisson/poisson_neumann.mini
+++ b/test/poisson/poisson_neumann.mini
@@ -12,5 +12,7 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 8e-8
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
diff --git a/test/poisson/poisson_neumann.ufl b/test/poisson/poisson_neumann.ufl
index d951842134b2859689629189bed631c28caf2442..16eea674b5fd40003af4817ee2c43d461b45e15e 100644
--- a/test/poisson/poisson_neumann.ufl
+++ b/test/poisson/poisson_neumann.ufl
@@ -16,7 +16,7 @@ v = TestFunction(V)
 # Define the boundary measure that knows where we are...
 ds = ds(subdomain_data=bctype)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx - j*v*ds(0)]
+r = (inner(grad(u), grad(v)) - f*v)*dx - j*v*ds(0)
 exact_solution = g
 is_dirichlet = bctype
-dirichlet_expression = g
\ No newline at end of file
+interpolate_expression = g
\ No newline at end of file
diff --git a/test/poisson/poisson_tensor.mini b/test/poisson/poisson_tensor.mini
index 8711de545698818505fc63e8701206bce8acdf54..ec4d2c310bf6ba32e9d17aca53aeba00f6cbc4ee 100644
--- a/test/poisson/poisson_tensor.mini
+++ b/test/poisson/poisson_tensor.mini
@@ -12,5 +12,7 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
-compare_l2errorsquared = 1e-7
\ No newline at end of file
+compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
\ No newline at end of file
diff --git a/test/poisson/poisson_tensor.ufl b/test/poisson/poisson_tensor.ufl
index 7208c1e0271e91d6ece79ee388510dc2f64a7a25..b527d05258667dae629f608a1a630e5f11f947b8 100644
--- a/test/poisson/poisson_tensor.ufl
+++ b/test/poisson/poisson_tensor.ufl
@@ -12,7 +12,7 @@ V = FiniteElement("CG", cell, 1)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(A*grad(u), grad(v)) + c*u*v -f*v)*dx]
+r= (inner(A*grad(u), grad(v)) + c*u*v -f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
\ No newline at end of file
+interpolate_expression = g
\ No newline at end of file
diff --git a/test/stokes/stokes.mini b/test/stokes/stokes.mini
index c236f3a87ddec3867557ef6056f3096c04f4be1e..b281b6bbe7755097c091007d1c9ed0b6840afad2 100644
--- a/test/stokes/stokes.mini
+++ b/test/stokes/stokes.mini
@@ -13,5 +13,7 @@ reference = hagenpoiseuille_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 0, 1 | expand num
 compare_l2errorsquared = 1e-11
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
diff --git a/test/stokes/stokes.ufl b/test/stokes/stokes.ufl
index 4307fbeee64a03e4ace5cb63931458ae5d9edbcf..99f21bbc943fd51fa1e514b7438e776e8a91e1f0 100644
--- a/test/stokes/stokes.ufl
+++ b/test/stokes/stokes.ufl
@@ -13,7 +13,6 @@ u, p = TrialFunctions(TH)
 
 r = (inner(grad(v), grad(u)) - div(v)*p - q*div(u))*dx
 
-forms = [r]
 is_dirichlet = v_bctype, v_bctype, 0
-dirichlet_expression = g_v, None
+interpolate_expression = g_v, None
 exact_solution = g_v, 8.*(1.-x[0])
\ No newline at end of file
diff --git a/test/stokes/stokes_3d_dg_quadrilateral.mini b/test/stokes/stokes_3d_dg_quadrilateral.mini
index d7c82422aab9a3f4522d9ba28c41514da2345f8f..59396277ede15f6a563d04fb448c4d3b5a445b3b 100644
--- a/test/stokes/stokes_3d_dg_quadrilateral.mini
+++ b/test/stokes/stokes_3d_dg_quadrilateral.mini
@@ -10,5 +10,7 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 6e-8
+
+[formcompiler.r]
 numerical_jacobian = 0, 1 | expand num
-compare_l2errorsquared = 6e-8
\ No newline at end of file
diff --git a/test/stokes/stokes_3d_dg_quadrilateral.ufl b/test/stokes/stokes_3d_dg_quadrilateral.ufl
index 84d1003e16d7f4f36dc0630434d98eb3d633cd3a..c47773f5821ce09ecb6182e81eeaff6bf17a7fc7 100644
--- a/test/stokes/stokes_3d_dg_quadrilateral.ufl
+++ b/test/stokes/stokes_3d_dg_quadrilateral.ufl
@@ -1,11 +1,13 @@
 cell = hexahedron
+degree = 2
+dim = 3
 
 x = SpatialCoordinate(cell)
 g_v = as_vector((4.*x[1]*(1.-x[1]), 0.0, 0.0))
 bctype = conditional(x[0] < 1. - 1e-8, 1, 0)
 
-P2 = VectorElement("DG", cell, 2)
-P1 = FiniteElement("DG", cell, 1)
+P2 = VectorElement("DG", cell, degree)
+P1 = FiniteElement("DG", cell, degree-1)
 TH = P2 * P1
 
 v, q = TestFunctions(TH)
@@ -14,23 +16,29 @@ u, p = TrialFunctions(TH)
 ds = ds(subdomain_id=1, subdomain_data=bctype)
 
 n = FacetNormal(cell)('+')
-eps = -1.0
-sigma = 1.0
-h_e = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = -1.0
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 r = inner(grad(u), grad(v))*dx \
   - p*div(v)*dx \
   - q*div(u)*dx \
-  + inner(avg(grad(u))*n, jump(v))*dS \
-  + sigma / h_e * inner(jump(u), jump(v))*dS \
-  - eps * inner(avg(grad(v))*n, jump(u))*dS \
-  - avg(p)*inner(jump(v), n)*dS \
-  - avg(q)*inner(jump(u), n)*dS \
+  - inner(avg(grad(u))*n, jump(v))*dS \
+  + gamma_int * inner(jump(u), jump(v))*dS \
+  + theta * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS \
   - inner(grad(u)*n, v)*ds \
-  + sigma / h_e * inner(u-g_v, v)*ds \
-  + eps * inner(grad(v)*n, u-g_v)*ds \
+  + gamma_ext * inner(u-g_v, v)*ds \
+  + theta * inner(grad(v)*n, u-g_v)*ds \
   + p*inner(v, n)*ds \
   + q*inner(u-g_v, n)*ds
 
-forms = [r]
 exact_solution = g_v, 8.*(1.-x[0])
diff --git a/test/stokes/stokes_3d_quadrilateral.mini b/test/stokes/stokes_3d_quadrilateral.mini
index 89c4796da75f3212ca59f44cace5a53229b1259a..17f3d9f510054ff9014830e258edd7840180248e 100644
--- a/test/stokes/stokes_3d_quadrilateral.mini
+++ b/test/stokes/stokes_3d_quadrilateral.mini
@@ -11,5 +11,7 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 1e-10
+
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
-compare_l2errorsquared = 1e-10
\ No newline at end of file
diff --git a/test/stokes/stokes_3d_quadrilateral.ufl b/test/stokes/stokes_3d_quadrilateral.ufl
index 0888298844ccf8e7ebda072c26c4908e96565072..f39cdcd42803cc94fd03ca98059707bafc1a844a 100644
--- a/test/stokes/stokes_3d_quadrilateral.ufl
+++ b/test/stokes/stokes_3d_quadrilateral.ufl
@@ -13,7 +13,6 @@ u, p = TrialFunctions(TH)
 
 r = (inner(grad(v), grad(u)) - div(v)*p - q*div(u))*dx
 
-forms = [r]
 exact_solution = g_v, 8.*(1.-x[0])
 is_dirichlet = v_bctype, v_bctype, v_bctype, 0
-dirichlet_expression = g_v, None
+interpolate_expression = g_v, None
diff --git a/test/stokes/stokes_dg.mini b/test/stokes/stokes_dg.mini
index 253a347941a128a1f2b38d736a055c6f827a3088..2fa0e00a83866ef7d1e3fa5036181e91973a92fa 100644
--- a/test/stokes/stokes_dg.mini
+++ b/test/stokes/stokes_dg.mini
@@ -15,5 +15,7 @@ zeroThreshold.data_0 = 1e-6
 zeroThreshold.data_1 = 1e-6
 
 [formcompiler]
-numerical_jacobian = 0, 1 | expand num
 compare_l2errorsquared = 1e-9
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
diff --git a/test/stokes/stokes_dg.ufl b/test/stokes/stokes_dg.ufl
index e7176b264efaf2bbb20146d5c19cd8d4b72a2c45..d4f4225b939d228fc4436d482a3cc989a97ad78f 100644
--- a/test/stokes/stokes_dg.ufl
+++ b/test/stokes/stokes_dg.ufl
@@ -1,11 +1,13 @@
 cell = triangle
+degree = 2
+dim = 2
 
 x = SpatialCoordinate(cell)
 g_v = as_vector((4*x[1]*(1.-x[1]), 0.0))
 bctype = conditional(x[0] < 1. - 1e-8, 1, 0)
 
-P2 = VectorElement("DG", cell, 2)
-P1 = FiniteElement("DG", cell, 1)
+P2 = VectorElement("DG", cell, degree)
+P1 = FiniteElement("DG", cell, degree-1)
 TH = P2 * P1
 
 v, q = TestFunctions(TH)
@@ -14,23 +16,29 @@ u, p = TrialFunctions(TH)
 ds = ds(subdomain_id=1, subdomain_data=bctype)
 
 n = FacetNormal(cell)('+')
-eps = -1.0
-sigma = 1.0
-h_e = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = -1.0
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 r = inner(grad(u), grad(v))*dx \
   - p*div(v)*dx \
   - q*div(u)*dx \
-  + inner(avg(grad(u))*n, jump(v))*dS \
-  + sigma / h_e * inner(jump(u), jump(v))*dS \
-  - eps * inner(avg(grad(v))*n, jump(u))*dS \
-  - avg(p)*inner(jump(v), n)*dS \
-  - avg(q)*inner(jump(u), n)*dS \
+  - inner(avg(grad(u))*n, jump(v))*dS \
+  + gamma_int * inner(jump(u), jump(v))*dS \
+  + theta * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS \
   - inner(grad(u)*n, v)*ds \
-  + sigma / h_e * inner(u-g_v, v)*ds \
-  + eps * inner(grad(v)*n, u-g_v)*ds \
+  + gamma_ext * inner(u-g_v, v)*ds \
+  + theta * inner(grad(v)*n, u-g_v)*ds \
   + p*inner(v, n)*ds \
   + q*inner(u-g_v, n)*ds
 
-forms = [r]
 exact_solution = g_v, 8*(1.-x[0])
\ No newline at end of file
diff --git a/test/stokes/stokes_dg_quadrilateral.mini b/test/stokes/stokes_dg_quadrilateral.mini
index 7f25099677036a8bfe1f715984ee500d9dc4d015..78954b12873569589c2874d15858c6121a242eb5 100644
--- a/test/stokes/stokes_dg_quadrilateral.mini
+++ b/test/stokes/stokes_dg_quadrilateral.mini
@@ -10,5 +10,7 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 0, 1 | expand num
 compare_l2errorsquared = 1e-8
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
diff --git a/test/stokes/stokes_dg_quadrilateral.ufl b/test/stokes/stokes_dg_quadrilateral.ufl
index 8f4415a7a3c30635e9add35faf58336995a820e4..0b37429b54cbe4e77fad3eb0abd88d9e345b7dcf 100644
--- a/test/stokes/stokes_dg_quadrilateral.ufl
+++ b/test/stokes/stokes_dg_quadrilateral.ufl
@@ -1,11 +1,13 @@
 cell = quadrilateral
+degree = 2
+dim = 2
 
 x = SpatialCoordinate(cell)
 g_v = as_vector((4*x[1]*(1.-x[1]), 0.0))
 bctype = conditional(x[0] < 1. - 1e-8, 1, 0)
 
-P2 = VectorElement("DG", cell, 2)
-P1 = FiniteElement("DG", cell, 1)
+P2 = VectorElement("DG", cell, degree)
+P1 = FiniteElement("DG", cell, degree-1)
 TH = P2 * P1
 
 v, q = TestFunctions(TH)
@@ -14,23 +16,29 @@ u, p = TrialFunctions(TH)
 ds = ds(subdomain_id=1, subdomain_data=bctype)
 
 n = FacetNormal(cell)('+')
-eps = -1.0
-sigma = 1.0
-h_e = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+
+# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
+theta = -1.0
+
+# penalty factor
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 r = inner(grad(u), grad(v))*dx \
   - p*div(v)*dx \
   - q*div(u)*dx \
-  + inner(avg(grad(u))*n, jump(v))*dS \
-  + sigma / h_e * inner(jump(u), jump(v))*dS \
-  - eps * inner(avg(grad(v))*n, jump(u))*dS \
-  - avg(p)*inner(jump(v), n)*dS \
-  - avg(q)*inner(jump(u), n)*dS \
+  - inner(avg(grad(u))*n, jump(v))*dS \
+  + gamma_int * inner(jump(u), jump(v))*dS \
+  + theta * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS \
   - inner(grad(u)*n, v)*ds \
-  + sigma / h_e * inner(u-g_v, v)*ds \
-  + eps * inner(grad(v)*n, u-g_v)*ds \
+  + gamma_ext * inner(u-g_v, v)*ds \
+  + theta * inner(grad(v)*n, u-g_v)*ds \
   + p*inner(v, n)*ds \
   + q*inner(u-g_v, n)*ds
 
-forms = [r]
 exact_solution = g_v, 8*(1.-x[0])
diff --git a/test/stokes/stokes_quadrilateral.mini b/test/stokes/stokes_quadrilateral.mini
index e9440771716292bda9664a26b2c8911e00a65a34..6ee36e8220463cbb75764a91b0ae1d2970f28eb3 100644
--- a/test/stokes/stokes_quadrilateral.mini
+++ b/test/stokes/stokes_quadrilateral.mini
@@ -11,5 +11,7 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 1e-10
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
diff --git a/test/stokes/stokes_quadrilateral.ufl b/test/stokes/stokes_quadrilateral.ufl
index c8f630b84aae43a68221ace670fe9fcb027af47d..4411e791138cf85824f24fc6549d52cf6ef1af0d 100644
--- a/test/stokes/stokes_quadrilateral.ufl
+++ b/test/stokes/stokes_quadrilateral.ufl
@@ -13,7 +13,6 @@ u, p = TrialFunctions(TH)
 
 r = (inner(grad(v), grad(u)) - div(v)*p - q*div(u))*dx
 
-forms = [r]
 is_dirichlet = v_bctype, v_bctype, 0
-dirichlet_expression = g_v, None
+interpolate_expression = g_v, None
 exact_solution = g_v, 8.*(1.-x[0])
diff --git a/test/stokes/stokes_stress.mini b/test/stokes/stokes_stress.mini
index af72867f443e85aa8b824f66a0c155b9b9f4e9a6..9663f5f1f9d329c02e568a169aeacb66ef4c2e63 100644
--- a/test/stokes/stokes_stress.mini
+++ b/test/stokes/stokes_stress.mini
@@ -15,6 +15,7 @@ reference = hagenpoiseuille_ref
 extension = vtu
 
 [formcompiler]
-# numerical_jacobian = 0, 1 | expand num
-numerical_jacobian = 1
 compare_l2errorsquared = 1e-11
+
+[formcompiler.r]
+numerical_jacobian = 1
diff --git a/test/stokes/stokes_stress.ufl b/test/stokes/stokes_stress.ufl
index a25a73adba1baa7fc141864320f7c639ff9808df..787e5a232383ad9e0767a6fb41e08cedc988943f 100644
--- a/test/stokes/stokes_stress.ufl
+++ b/test/stokes/stokes_stress.ufl
@@ -14,7 +14,6 @@ u, p, S  = TrialFunctions(TH)
 
 r = (inner(grad(v), S) + inner(grad(u) - S, T) - div(v)*p - q*div(u))*dx
 
-forms = [r]
 is_dirichlet = v_bctype, v_bctype, 0, 0, 0, 0, 0
-dirichlet_expression = 4*x[1]*(1.-x[1]), 0.0, None, None, None, None, None
+interpolate_expression = 4*x[1]*(1.-x[1]), 0.0, None, None, None, None, None
 exact_solution = 4*x[1]*(1.-x[1]), 0.0, 8*(1.-x[0]), 0.0, 0.0, -1.*8*x[1] + 4., 0.0
\ No newline at end of file
diff --git a/test/stokes/stokes_stress_sym.mini b/test/stokes/stokes_stress_sym.mini
index 1aa3d6f087cae99310dd2c7b3a50721f3fd447f6..9646ec0840f422c924e53ef289a87a6cebc6c3cb 100644
--- a/test/stokes/stokes_stress_sym.mini
+++ b/test/stokes/stokes_stress_sym.mini
@@ -13,5 +13,7 @@ reference = hagenpoiseuille_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1
 compare_l2errorsquared = 1e-6
+
+[formcompiler.r]
+numerical_jacobian = 1
diff --git a/test/stokes/stokes_stress_sym.ufl b/test/stokes/stokes_stress_sym.ufl
index f5dc520c07202ee05b4294b153d90f3dcd5b4ab0..8e2d55dd4cfca7335932b751d5ae2c2bb71aad59 100644
--- a/test/stokes/stokes_stress_sym.ufl
+++ b/test/stokes/stokes_stress_sym.ufl
@@ -20,7 +20,6 @@ r = (inner(grad(v), S) + inner(2*sym(grad(u)) - S, T) - div(v)*p - q*div(u))*dx
 # \
 #  + inner(S.T*n, v)*ds
 
-forms = [r]
 is_dirichlet = v_bctype, v_bctype, 0, 0, 0, 0
-dirichlet_expression = 4*x[1]*(1.-x[1]), 0.0, None, None, None, None
+interpolate_expression = 4*x[1]*(1.-x[1]), 0.0, None, None, None, None
 exact_solution = 4*x[1]*(1.-x[1]), 0.0, 8*(1.-x[0]), 0.0, 0.0, -1.*8*x[1] + 4.
\ No newline at end of file
diff --git a/test/stokes/stokes_sym.mini b/test/stokes/stokes_sym.mini
index 26cc91467a701e8643b040453943d83a2ece3e96..89dcee74944f7445b78176aafc61a7105d4c5f99 100644
--- a/test/stokes/stokes_sym.mini
+++ b/test/stokes/stokes_sym.mini
@@ -13,5 +13,7 @@ reference = hagenpoiseuille_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 0, 1 | expand num
 compare_l2errorsquared = 1e-10
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
diff --git a/test/stokes/stokes_sym.ufl b/test/stokes/stokes_sym.ufl
index 1ae13db697976a046b461096a530ef315a2d7417..c7fe07ceafa660906bb33e21ad9603758d23ed3a 100644
--- a/test/stokes/stokes_sym.ufl
+++ b/test/stokes/stokes_sym.ufl
@@ -16,7 +16,6 @@ n = FacetNormal(triangle)('+')
 
 r = (inner(2*sym(grad(u)), grad(v)) - div(v)*p - q*div(u))*dx - inner(grad(u).T*n,v)*ds
 
-forms = [r]
 is_dirichlet = v_bctype, v_bctype, 0
-dirichlet_expression = g_v, None
+interpolate_expression = g_v, None
 exact_solution = g_v, 8.*(1.-x[0])
\ No newline at end of file
diff --git a/test/sumfact/hyperbolic/linearacoustics.mini b/test/sumfact/hyperbolic/linearacoustics.mini
index 5d113e2b9737a8de3a2859ff99b44b6ba2d284b4..1f7b1fa542172fd28e45321d36ab82dc3cbdc1ef 100644
--- a/test/sumfact/hyperbolic/linearacoustics.mini
+++ b/test/sumfact/hyperbolic/linearacoustics.mini
@@ -13,6 +13,13 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+explicit_time_stepping = 1
+operators = mass, r
+
+[formcompiler.mass]
+numerical_jacobian = 1
+sumfact = 1
+
+[formcompiler.r]
 numerical_jacobian = 1
 sumfact = 1
-explicit_time_stepping = 1
\ No newline at end of file
diff --git a/test/sumfact/hyperbolic/linearacoustics.ufl b/test/sumfact/hyperbolic/linearacoustics.ufl
index 8b9d48c4433f72395c054c88ea6c4eaeedb9fcb0..5a78e7848578053dbb8d7f75a94c2f901f5831d3 100644
--- a/test/sumfact/hyperbolic/linearacoustics.ufl
+++ b/test/sumfact/hyperbolic/linearacoustics.ufl
@@ -21,12 +21,11 @@ flux = as_matrix([[q0,  q1],
                   [0., rho]])
 
 # Define numerical fluxes to choose from
-llf_flux = dot(avg(flux), n) - 0.5*jump(u)
+llf_flux = dot(avg(flux), n) + 0.5*jump(u)
 numerical_flux = llf_flux
 
 r = -1. * inner(flux, grad(v))*dx \
-  - inner(numerical_flux, jump(v))*dS \
+  + inner(numerical_flux, jump(v))*dS \
   + inner(u, v)*ds
 
-forms = [mass, r]
-dirichlet_expression = f, 0.0, 0.0
+interpolate_expression = f, 0.0, 0.0
diff --git a/test/sumfact/hyperbolic/lineartransport.mini b/test/sumfact/hyperbolic/lineartransport.mini
index f4d694b2186ad9a3c764c04fcaf898ac566d55af..4ea0034f57922c41dda48dbac96d8ef89ec426b0 100644
--- a/test/sumfact/hyperbolic/lineartransport.mini
+++ b/test/sumfact/hyperbolic/lineartransport.mini
@@ -13,10 +13,16 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-sumfact = 1
-#fastdg = 1
 # This tests that all mass is transported out of the domain.
 # While this is not the best of tests, it is something easily checked for.
-numerical_jacobian = 1
 explicit_time_stepping = 1
 compare_l2errorsquared = 1e-10
+operators = mass, r
+
+[formcompiler.mass]
+sumfact = 1
+numerical_jacobian = 1
+
+[formcompiler.r]
+sumfact = 1
+numerical_jacobian = 1
diff --git a/test/sumfact/hyperbolic/lineartransport.ufl b/test/sumfact/hyperbolic/lineartransport.ufl
index 8fa698d935c924f4c79fb9653c1d43e959909ccb..15a8021854222931b8dbef707082e90b406a47dc 100644
--- a/test/sumfact/hyperbolic/lineartransport.ufl
+++ b/test/sumfact/hyperbolic/lineartransport.ufl
@@ -15,17 +15,16 @@ v = TestFunction(V)
 beta = as_vector((1., 1.))
 n = FacetNormal(cell)('+')
 
-def numerical_flux(normal, outside, inside):
+def numerical_flux(normal, inside, outside):
 	return conditional(inner(beta, n) > 0, inside, outside)*inner(beta, n)
 
 mass = u*v*dx
 
 r = -1.*u*inner(beta, grad(v))*dx \
-  - numerical_flux(n, u('+'), u('-'))*jump(v)*dS \
+  + numerical_flux(n, u('+'), u('-'))*jump(v)*dS \
   + inner(beta, n)*u*v*dso \
-  + numerical_flux(n, 0.0, u('-'))*v*dsd
+  + numerical_flux(n, u('+'), 0.0)*v*dsd
 
-forms = [mass, r]
 exact_solution = 0.0
 is_dirichlet = dirichlet
-dirichlet_expression = initial
\ No newline at end of file
+interpolate_expression = initial
\ No newline at end of file
diff --git a/test/sumfact/hyperbolic/shallowwater.mini b/test/sumfact/hyperbolic/shallowwater.mini
index 8346ebf34a742fcd8afa4c19dec08701c4f81fbe..5f63f48bd0bd55537d1055ff685d0998ac28ad15 100644
--- a/test/sumfact/hyperbolic/shallowwater.mini
+++ b/test/sumfact/hyperbolic/shallowwater.mini
@@ -14,4 +14,10 @@ extension = vtu
 
 [formcompiler]
 explicit_time_stepping = 1
+operators = mass, r
+
+[formcompiler.mass]
+sumfact = 1
+
+[formcompiler.r]
 sumfact = 1
diff --git a/test/sumfact/hyperbolic/shallowwater.ufl b/test/sumfact/hyperbolic/shallowwater.ufl
index 9fca496e9fab29d58e91e79f15fb6fed3b4265af..d0bfc147cf83062855836606edaf4a337f5d21c4 100644
--- a/test/sumfact/hyperbolic/shallowwater.ufl
+++ b/test/sumfact/hyperbolic/shallowwater.ufl
@@ -27,13 +27,12 @@ bflux = as_matrix([[-q0,                  -q1],
 
 
 # Define numerical fluxes to choose from
-llf_flux = dot(avg(flux), n) - 0.5*jump(u)
+llf_flux = dot(avg(flux), n) + 0.5*jump(u)
 boundary_flux = 0.5*dot(flux + bflux, n) + as_vector([0., q0, q1])
 numerical_flux = llf_flux
 
 r = -1. * inner(flux, grad(v))*dx \
-  - inner(numerical_flux, jump(v))*dS \
+  + inner(numerical_flux, jump(v))*dS \
   + inner(boundary_flux, v)*ds
 
-forms = [mass, r]
-dirichlet_expression = f, 0.0, 0.0
+interpolate_expression = f, 0.0, 0.0
diff --git a/test/sumfact/mass/mass.mini b/test/sumfact/mass/mass.mini
index 44870439291a6111c18353f2c80e9be2116d3b28..6b0e9db8144fe18f7cf5e89f016d228d74ae9173 100644
--- a/test/sumfact/mass/mass.mini
+++ b/test/sumfact/mass/mass.mini
@@ -12,7 +12,7 @@ printmatrix = 1
 name = {__name}
 extension = vtu
 
-[formcompiler]
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
 vectorization_quadloop = 1, 0 | expand vec
 sumfact = 1
diff --git a/test/sumfact/mass/mass.ufl b/test/sumfact/mass/mass.ufl
index 6434e09f62b28ca9ea003c936997233d31449431..c11e65676418c9472cb77800dd9c8d477ebf7024 100644
--- a/test/sumfact/mass/mass.ufl
+++ b/test/sumfact/mass/mass.ufl
@@ -6,5 +6,3 @@ u = TrialFunction(V)
 v = TestFunction(V)
 
 r = u * v * dx
-
-forms = [r]
diff --git a/test/sumfact/mass/mass_3d.mini b/test/sumfact/mass/mass_3d.mini
index aba93533768a7b5463052c6585f6e45648380b70..fff87d11bb272dc5d5bd57e8f12569581ddc2485 100644
--- a/test/sumfact/mass/mass_3d.mini
+++ b/test/sumfact/mass/mass_3d.mini
@@ -13,7 +13,7 @@ printmatrix = true
 name = {__name}
 extension = vtu
 
-[formcompiler]
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
 vectorization_quadloop = 1, 0 | expand vec
 sumfact = 1
diff --git a/test/sumfact/mass/mass_3d.ufl b/test/sumfact/mass/mass_3d.ufl
index 5f55103e52f0b84c550e38f68c0acc8d77465793..1336a91db188e0626353294281f109e0d30ad8ae 100644
--- a/test/sumfact/mass/mass_3d.ufl
+++ b/test/sumfact/mass/mass_3d.ufl
@@ -6,5 +6,3 @@ u = TrialFunction(V)
 v = TestFunction(V)
 
 r = u * v * dx
-
-forms = [r]
diff --git a/test/sumfact/mass/sliced.mini b/test/sumfact/mass/sliced.mini
index 90dab43e70b8ddc38830c37afb2dd83b4116f5e7..17d331901a999c5d2d90a0453dfc524183f6e132 100644
--- a/test/sumfact/mass/sliced.mini
+++ b/test/sumfact/mass/sliced.mini
@@ -9,7 +9,7 @@ printmatrix = true
 name = {__name}
 extension = vtu
 
-[formcompiler]
+[formcompiler.r]
 numerical_jacobian = 1
 vectorization_strategy = explicit
 vectorization_horizontal = 1
diff --git a/test/sumfact/poisson/diagonal.mini b/test/sumfact/poisson/diagonal.mini
index d3744184c52abd1320aa796bc16249c478afe9a7..298fadba9554771a7bdf4810b15ad7e925861cbf 100644
--- a/test/sumfact/poisson/diagonal.mini
+++ b/test/sumfact/poisson/diagonal.mini
@@ -8,8 +8,10 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-sumfact = 1
 compare_l2errorsquared = 1e-5
+
+[formcompiler.r]
+sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
 vectorization_horizontal = 2
diff --git a/test/sumfact/poisson/opcount_poisson_2d_order2.mini b/test/sumfact/poisson/opcount_poisson_2d_order2.mini
index 2350f1137da3df811df6bfa56bb6b373c0b422ab..538189b2e5c14bd745c748fc43e3161c4013cc77 100644
--- a/test/sumfact/poisson/opcount_poisson_2d_order2.mini
+++ b/test/sumfact/poisson/opcount_poisson_2d_order2.mini
@@ -12,11 +12,12 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 0
 compare_l2errorsquared = 1e-8
-sumfact = 1
 opcounter = 1
 instrumentation_level = 4
 
+[formcompiler.r]
+sumfact = 1
+
 [formcompiler.ufl_variants]
 degree = 2
diff --git a/test/sumfact/poisson/opcount_sumfact_poisson_dg_2d_vec.mini b/test/sumfact/poisson/opcount_sumfact_poisson_dg_2d_vec.mini
index 063987b96cfb04af6223b426258e412f839582e7..b657c1a2c731e0606093ab3c0e00044afaadd3ae 100644
--- a/test/sumfact/poisson/opcount_sumfact_poisson_dg_2d_vec.mini
+++ b/test/sumfact/poisson/opcount_sumfact_poisson_dg_2d_vec.mini
@@ -10,11 +10,11 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 0
-sumfact = 1
 opcounter = 1
 instrumentation_level = 4
 
+[formcompiler.r]
+sumfact = 1
 
 [formcompiler.ufl_variants]
 degree = 1
diff --git a/test/sumfact/poisson/poisson_2d.mini b/test/sumfact/poisson/poisson_2d.mini
index 9fab490cf7b12a362767520975173927971e2382..d9ce1773e575fab9ec9e05057207093c8ea8b747 100644
--- a/test/sumfact/poisson/poisson_2d.mini
+++ b/test/sumfact/poisson/poisson_2d.mini
@@ -14,8 +14,10 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 4e-5, 4e-9 | expand deg
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
 sumfact = 1
 vectorization_strategy = explicit, none | expand grad
 quadrature_order = 2, 4
diff --git a/test/sumfact/poisson/poisson_2d.ufl b/test/sumfact/poisson/poisson_2d.ufl
index d2c78a8d2a1a928ae1f941db4e1a337c8d308bbb..f0cecc18e42eb5ee9b7bc4b488bb8e926f34ea8b 100644
--- a/test/sumfact/poisson/poisson_2d.ufl
+++ b/test/sumfact/poisson/poisson_2d.ufl
@@ -12,7 +12,7 @@ V = TensorProductElement(V_0, V_1, cell=cell)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
+interpolate_expression = g
diff --git a/test/sumfact/poisson/poisson_3d.mini b/test/sumfact/poisson/poisson_3d.mini
index 2ddbd626bac79456c3b138e0386a8ac94ee9aa15..e3e6da7d29475cedd70db44a2b667d6171ae5c80 100644
--- a/test/sumfact/poisson/poisson_3d.mini
+++ b/test/sumfact/poisson/poisson_3d.mini
@@ -15,8 +15,10 @@ reference = poisson_ref
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 1, 0 | expand num
 compare_l2errorsquared = 1e-4, 1e-8 | expand deg
+
+[formcompiler.r]
+numerical_jacobian = 1, 0 | expand num
 sumfact = 1
 vectorization_quadloop = 1, 0 | expand quad
 vectorization_strategy = explicit, none | expand grad
diff --git a/test/sumfact/poisson/poisson_3d.ufl b/test/sumfact/poisson/poisson_3d.ufl
index 529db2a042c01c43ffb7c71c390c323dd3c3d4ac..313cec8ec572013d5604d3a9dde332c13f359b3e 100644
--- a/test/sumfact/poisson/poisson_3d.ufl
+++ b/test/sumfact/poisson/poisson_3d.ufl
@@ -9,7 +9,7 @@ V = FiniteElement("CG", cell, degree)
 u = TrialFunction(V)
 v = TestFunction(V)
 
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]
+r = (inner(grad(u), grad(v)) - f*v)*dx
 exact_solution = g
 is_dirichlet = 1
-dirichlet_expression = g
+interpolate_expression = g
diff --git a/test/sumfact/poisson/poisson_dg_2d.mini b/test/sumfact/poisson/poisson_dg_2d.mini
index 99adc0e31563c6a85ecad18e349ed74adcaf21a0..d6799eac4600300f86303ea63853365698efdf1f 100644
--- a/test/sumfact/poisson/poisson_dg_2d.mini
+++ b/test/sumfact/poisson/poisson_dg_2d.mini
@@ -14,9 +14,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 5e-5, 5e-7 | expand deg
+
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
 sumfact = 1
-compare_l2errorsquared = 5e-5, 5e-7 | expand deg
 vectorization_quadloop = 1, 0 | expand quad
 vectorization_strategy = explicit, none | expand grad
 
diff --git a/test/sumfact/poisson/poisson_dg_2d.ufl b/test/sumfact/poisson/poisson_dg_2d.ufl
index fefc67d64c9a0b1a6dac9c34423933d149e8c88e..3c2cf8767cdc395643366936529fe44c2a39be5b 100644
--- a/test/sumfact/poisson/poisson_dg_2d.ufl
+++ b/test/sumfact/poisson/poisson_dg_2d.ufl
@@ -1,4 +1,5 @@
 cell = "quadrilateral"
+dim = 2
 
 x = SpatialCoordinate(cell)
 c = (0.5-x[0])**2 + (0.5-x[1])**2
@@ -13,21 +14,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/sumfact/poisson/poisson_dg_3d.mini b/test/sumfact/poisson/poisson_dg_3d.mini
index b23fda0eba605025076e6a92dfb295ea06525d00..f0b4ef26f73509e9dea47a1cca366c83a51bfb93 100644
--- a/test/sumfact/poisson/poisson_dg_3d.mini
+++ b/test/sumfact/poisson/poisson_dg_3d.mini
@@ -14,9 +14,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 1e-4, 5e-6 | expand deg
+
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
 sumfact = 1
-compare_l2errorsquared = 1e-4, 5e-6 | expand deg
 vectorization_quadloop = 1, 0 | expand quad
 vectorization_strategy = explicit, none | expand grad
 
diff --git a/test/sumfact/poisson/poisson_dg_3d.ufl b/test/sumfact/poisson/poisson_dg_3d.ufl
index 80d78c363b27b6e30476a1189aed521b33b65fa8..0f7f0399b8e07ad85e45a20071304e469b1b6133 100644
--- a/test/sumfact/poisson/poisson_dg_3d.ufl
+++ b/test/sumfact/poisson/poisson_dg_3d.ufl
@@ -1,4 +1,5 @@
-cell = "hexahedron"
+cell = hexahedron
+dim = 3
 
 x = SpatialCoordinate(cell)
 c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2
@@ -13,21 +14,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
 r = inner(grad(u), grad(v))*dx \
-  + inner(n, avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - f*v*dx \
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + gamma*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
-  - f*v*dx \
-  - theta*g*inner(grad(v), n)*ds \
-  - gamma*g*v*ds
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/sumfact/poisson/poisson_dg_tensor.mini b/test/sumfact/poisson/poisson_dg_tensor.mini
index 4a45e4a1fa469c6fc2753ad230b486bc55bd1b55..f6884f965eac0a47416af97b26aa849a6be688ad 100644
--- a/test/sumfact/poisson/poisson_dg_tensor.mini
+++ b/test/sumfact/poisson/poisson_dg_tensor.mini
@@ -12,8 +12,10 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-sumfact = 1
 compare_l2errorsquared = 3e-4
+
+[formcompiler.r]
+sumfact = 1
 vectorization_quadloop = 1, 0 | expand quad
 vectorization_strategy = explicit, none | expand grad
 
diff --git a/test/sumfact/poisson/poisson_dg_tensor.ufl b/test/sumfact/poisson/poisson_dg_tensor.ufl
index 0d4b7a79ee8e2b7183b8664bb639990960205e0a..2734383f71d3afddc39c6c72f41ba5545def3b54 100644
--- a/test/sumfact/poisson/poisson_dg_tensor.ufl
+++ b/test/sumfact/poisson/poisson_dg_tensor.ufl
@@ -1,14 +1,14 @@
 cell = hexahedron
+dim = 3
 
 x = SpatialCoordinate(cell)
-
 I = Identity(3)
 A = as_matrix([[x[i]*x[j] + I[i,j] for j in range(3)] for i in range(3)])
 g = x[0]**2 + x[1]**2 + x[2]**2
 c = 10.
 f = -6.
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
@@ -16,21 +16,24 @@ v = TestFunction(V)
 n = FacetNormal(cell)('+')
 
 # penalty factor
-gamma = 1.0
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 # SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
 theta = 1.0
 
-r = (inner(A*grad(u), grad(v)) + c*u*v)*dx \
-  + inner(n, A*avg(grad(u)))*jump(v)*dS \
-  + gamma*jump(u)*jump(v)*dS \
-  - theta*jump(u)*inner(A*avg(grad(v)), n)*dS \
-  - inner(n, A*grad(u))*v*ds \
-  + gamma*u*v*ds \
-  + theta*u*inner(A*grad(v), n)*ds \
+r = inner(grad(u), grad(v))*dx \
   - f*v*dx \
-  - theta*g*inner(A*grad(v), n)*ds \
-  - gamma*g*v*ds
+  - inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - gamma_ext*g*v*ds \
+  - theta*g*inner(grad(v), n)*ds
 
-forms = [r]
 exact_solution = g
diff --git a/test/sumfact/poisson/poisson_fastdg_2d.mini b/test/sumfact/poisson/poisson_fastdg_2d.mini
index 541de8712b627f6327076d87586b6d378fa54e78..53012e325a57387003dc42f2af4268f48cbdaa98 100644
--- a/test/sumfact/poisson/poisson_fastdg_2d.mini
+++ b/test/sumfact/poisson/poisson_fastdg_2d.mini
@@ -12,9 +12,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 1e-4
+
+[formcompiler.r]
 numerical_jacobian = 0
 sumfact = 1
-compare_l2errorsquared = 1e-4
 vectorization_quadloop = 1, 0 | expand quadvec
 vectorization_strategy = explicit, none | expand gradvec
 fastdg = 1
diff --git a/test/sumfact/poisson/poisson_fastdg_3d.mini b/test/sumfact/poisson/poisson_fastdg_3d.mini
index b5974a4fe62609ca038d315656bf3c9f333f3e32..46552ce9e960e8a53715016e3de0bd6e770b5425 100644
--- a/test/sumfact/poisson/poisson_fastdg_3d.mini
+++ b/test/sumfact/poisson/poisson_fastdg_3d.mini
@@ -12,9 +12,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 1e-4
+
+[formcompiler.r]
 numerical_jacobian = 0
 sumfact = 1
-compare_l2errorsquared = 1e-4
 vectorization_quadloop = 1, 0 | expand quadvec
 vectorization_strategy = explicit, none | expand gradvec
 fastdg = 1
diff --git a/test/sumfact/poisson/sliced.mini b/test/sumfact/poisson/sliced.mini
index 858b8c6b6b8804f3cede25236ff29ce66bae010b..55b6fcf7df3b0ee33f06bb97da0e8b555104c161 100644
--- a/test/sumfact/poisson/sliced.mini
+++ b/test/sumfact/poisson/sliced.mini
@@ -8,8 +8,10 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-sumfact = 1
 compare_l2errorsquared = 1e-5
+
+[formcompiler.r]
+sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
 vectorization_horizontal = 1
diff --git a/test/sumfact/stokes/stokes.mini b/test/sumfact/stokes/stokes.mini
index 10dca54704f30369881095216a6f5f94c25585e0..cfb89ec57d5504fca215ddc08d862e5e2484fc19 100644
--- a/test/sumfact/stokes/stokes.mini
+++ b/test/sumfact/stokes/stokes.mini
@@ -12,7 +12,9 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 1e-12
+
+[formcompiler.r]
 numerical_jacobian = 1, 0 | expand num
 vectorization_quadloop = 1, 0 | expand quad
-compare_l2errorsquared = 1e-12
 sumfact = 1
diff --git a/test/sumfact/stokes/stokes.ufl b/test/sumfact/stokes/stokes.ufl
index fafe0714ccee659ee18ef550cc2a17fba01077e6..9c5cb27a59d7662402a0bc6d65f9818dc4761577 100644
--- a/test/sumfact/stokes/stokes.ufl
+++ b/test/sumfact/stokes/stokes.ufl
@@ -14,7 +14,6 @@ u, p = TrialFunctions(TH)
 
 r = (inner(grad(v), grad(u)) - div(v)*p - q*div(u))*dx
 
-forms = [r]
 exact_solution = g_v, 8.*(1.-x[0])
-dirichlet_expression = g_v, None
+interpolate_expression = g_v, None
 is_dirichlet = v_bctype, v_bctype, None
\ No newline at end of file
diff --git a/test/sumfact/stokes/stokes_3d_dg.mini b/test/sumfact/stokes/stokes_3d_dg.mini
index b7ec60614d3159d8792b42ab1c416c4926150120..7fb1b22ed1de823afff15f519beb4aa474a3ec11 100644
--- a/test/sumfact/stokes/stokes_3d_dg.mini
+++ b/test/sumfact/stokes/stokes_3d_dg.mini
@@ -12,7 +12,9 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
+compare_l2errorsquared = 1e-10
+
+[formcompiler.r]
 numerical_jacobian = 0
 sumfact = 1
-fastdg = 1, 0 | expand fastdg
-compare_l2errorsquared = 1e-10
\ No newline at end of file
+fastdg = 1, 0 | expand fastdg
\ No newline at end of file
diff --git a/test/sumfact/stokes/stokes_3d_dg.ufl b/test/sumfact/stokes/stokes_3d_dg.ufl
index 84d1003e16d7f4f36dc0630434d98eb3d633cd3a..8193a8933e9a3f16722737802e93199b2739ec30 100644
--- a/test/sumfact/stokes/stokes_3d_dg.ufl
+++ b/test/sumfact/stokes/stokes_3d_dg.ufl
@@ -21,16 +21,15 @@ h_e = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
 r = inner(grad(u), grad(v))*dx \
   - p*div(v)*dx \
   - q*div(u)*dx \
-  + inner(avg(grad(u))*n, jump(v))*dS \
+  - inner(avg(grad(u))*n, jump(v))*dS \
   + sigma / h_e * inner(jump(u), jump(v))*dS \
-  - eps * inner(avg(grad(v))*n, jump(u))*dS \
-  - avg(p)*inner(jump(v), n)*dS \
-  - avg(q)*inner(jump(u), n)*dS \
+  + eps * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS \
   - inner(grad(u)*n, v)*ds \
   + sigma / h_e * inner(u-g_v, v)*ds \
   + eps * inner(grad(v)*n, u-g_v)*ds \
   + p*inner(v, n)*ds \
   + q*inner(u-g_v, n)*ds
 
-forms = [r]
 exact_solution = g_v, 8.*(1.-x[0])
diff --git a/test/sumfact/stokes/stokes_dg.mini b/test/sumfact/stokes/stokes_dg.mini
index e3374e4a18e844f6f1356ce45b38e5b5212f015f..f34f23422ae888f7ea5d3085b41b87c2ac929346 100644
--- a/test/sumfact/stokes/stokes_dg.mini
+++ b/test/sumfact/stokes/stokes_dg.mini
@@ -13,9 +13,11 @@ name = {__name}
 extension = vtu
 
 [formcompiler]
-numerical_jacobian = 0, 1 | expand num
 compare_l2errorsquared = 1e-8
+
+[formcompiler.r]
+numerical_jacobian = 0, 1 | expand num
 sumfact = 1
 fastdg = 1, 0 | expand fastdg
 
-{formcompiler.fastdg} == 1 and {formcompiler.numerical_jacobian} == 1 | exclude
\ No newline at end of file
+{formcompiler.r.fastdg} == 1 and {formcompiler.r.numerical_jacobian} == 1 | exclude
\ No newline at end of file
diff --git a/test/sumfact/stokes/stokes_dg.ufl b/test/sumfact/stokes/stokes_dg.ufl
index 39c243a00c7b16c857f551813b6bf0f4a99ad065..7f873537ee0c6b5f1015091abef27d660823b331 100644
--- a/test/sumfact/stokes/stokes_dg.ufl
+++ b/test/sumfact/stokes/stokes_dg.ufl
@@ -21,16 +21,15 @@ h_e = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
 r = inner(grad(u), grad(v))*dx \
   - p*div(v)*dx \
   - q*div(u)*dx \
-  + inner(avg(grad(u))*n, jump(v))*dS \
+  - inner(avg(grad(u))*n, jump(v))*dS \
   + sigma / h_e * inner(jump(u), jump(v))*dS \
-  - eps * inner(avg(grad(v))*n, jump(u))*dS \
-  - avg(p)*inner(jump(v), n)*dS \
-  - avg(q)*inner(jump(u), n)*dS \
+  + eps * inner(avg(grad(v))*n, jump(u))*dS \
+  + avg(p)*inner(jump(v), n)*dS \
+  + avg(q)*inner(jump(u), n)*dS \
   - inner(grad(u)*n, v)*ds \
   + sigma / h_e * inner(u-g_v, v)*ds \
   + eps * inner(grad(v)*n, u-g_v)*ds \
   + p*inner(v, n)*ds \
   + q*inner(u-g_v, n)*ds
 
-forms = [r]
 exact_solution = g_v, 8*(1.-x[0])
\ No newline at end of file