diff --git a/applications/knl/poisson_dg/knl_poisson_dg.mini b/applications/knl/poisson_dg/knl_poisson_dg.mini
index 37c02c14602bcb3a5a386ce885776a1fd4f20594..ba2f345fd39edcca22fbbb9aded8e4bd3193157c 100644
--- a/applications/knl/poisson_dg/knl_poisson_dg.mini
+++ b/applications/knl/poisson_dg/knl_poisson_dg.mini
@@ -51,7 +51,6 @@ vectorization_vertical = 2
 matrix_free = 1
 generate_jacobians = 0
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-assure_statement_ordering = 1
 
 [formcompiler.ufl_variants]
 cell = hexahedron
diff --git a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
index e3466ed40eeebec2f361c7da2367ff60b20152ea..4f3f5826f71fd9fc742efa4d88ae222e45759529 100644
--- a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
+++ b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
@@ -49,7 +49,6 @@ vectorization_strategy = explicit
 vectorization_horizontal = 4
 vectorization_vertical = 2
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-assure_statement_ordering = 1
 matrix_free = 1
 generate_jacobians = 0
 
diff --git a/applications/poisson_dg/poisson_dg.mini b/applications/poisson_dg/poisson_dg.mini
index 6e9ccac0e0d27a765f54f094077bb32282d4fec4..7e65a4815f5fccc2dc6aae955bbdef559e9ebe92 100644
--- a/applications/poisson_dg/poisson_dg.mini
+++ b/applications/poisson_dg/poisson_dg.mini
@@ -46,7 +46,6 @@ sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-assure_statement_ordering = 1
 matrix_free = 1
 generate_jacobians = 0
 
diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
index 4e9c6f95700feddaf4b48e1a9147ba455378ad4e..4e7ac555b33fa5fcef568cca5b6dddf6dd3552e3 100644
--- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini
+++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
@@ -46,7 +46,6 @@ sumfact = 1
 vectorization_quadloop = 1
 vectorization_strategy = explicit
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
-assure_statement_ordering = 1
 matrix_free = 1
 generate_jacobians = 0
 
diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini
index 6b1a1e9ca0b3cf1fdd2bc79cb9bd00ae12ebbb0c..a80567f9b73473ae09e52c3a768177431435e849 100644
--- a/applications/stokes_dg/stokes_dg.mini
+++ b/applications/stokes_dg/stokes_dg.mini
@@ -48,7 +48,6 @@ vectorization_quadloop = 1
 vectorization_strategy = model
 vectorization_allow_quadrature_changes = 1
 quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval
-assure_statement_ordering = 1
 matrix_free = 1
 generate_jacobians = 0
 
diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh
index 4abe4fa9de8f80bac0465ccfa7724035ce586063..2c38492b19c08452f7607b084f855782bb3cd35f 100755
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -2,6 +2,7 @@
 
 pushd python/loopy
 git apply ../../patches/loopy/Current.patch
+git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
 popd
 
 pushd dune/perftool/vectorclass
diff --git a/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch b/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
new file mode 100644
index 0000000000000000000000000000000000000000..436533b399471411d105addab125814de66ec4e5
--- /dev/null
+++ b/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
@@ -0,0 +1,33 @@
+From abac8a2068e0333a0f00c276519c24c5c16bedf4 Mon Sep 17 00:00:00 2001
+From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
+Date: Mon, 26 Mar 2018 11:13:42 +0200
+Subject: [PATCH] Disable a logging statement that breaks
+
+---
+ loopy/kernel/tools.py | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
+index 15840180..cb877eb6 100644
+--- a/loopy/kernel/tools.py
++++ b/loopy/kernel/tools.py
+@@ -197,11 +197,11 @@ def find_all_insn_inames(kernel):
+         assert isinstance(write_deps, frozenset), type(insn)
+         assert isinstance(iname_deps, frozenset), type(insn)
+ 
+-        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
+-                "read deps: %s - write deps: %s" % (
+-                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
+-                    ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)),
+-                    ))
++#         logger.debug("%s: find_all_insn_inames: %s (init): %s - "
++#                 "read deps: %s - write deps: %s" % (
++#                     kernel.name, insn.id, ", ".join(sorted(iname_deps)),
++#                     ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)),
++#                     ))
+ 
+         insn_id_to_inames[insn.id] = iname_deps
+         insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
+-- 
+2.11.0
+
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index df47d5e9fb1ac03d8ed6bde6ec1de203470f4d2f..a4d8292f5f88bc315980efb521bf7c2cf6a95153 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -140,10 +140,9 @@ def _insn_cache_key(code=None, expression=None, **kwargs):
 def instruction(code=None, expression=None, **kwargs):
     assert (code is not None) or (expression is not None)
     assert not ((code is not None) and (expression is not None))
-    assert 'id' not in kwargs
 
     # Get an ID for this instruction
-    id = 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4))
+    id = kwargs.pop("id", 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4)))
 
     # Now create the actual instruction
     if code:
diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py
deleted file mode 100644
index c0e2ab310ce4cf6ddc8c8ea403d2e5cad1c63365..0000000000000000000000000000000000000000
--- a/python/dune/perftool/loopy/buffer.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from dune.perftool.error import PerftoolLoopyError
-from dune.perftool.generation import (get_counted_variable,
-                                      kernel_cached,
-                                      temporary_variable,
-                                      )
-
-
-class FlipFlopBuffer(object):
-    def __init__(self, identifier):
-        self.identifier = identifier
-
-        # Initialize the counter that switches between the base storages!
-        self._current = 0
-
-        # Generate the base storage names
-        self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in (0, 1))
-
-    def switch_base_storage(self):
-        self._current = (self._current + 1) % 2
-
-    def get_temporary(self, **kwargs):
-        assert("base_storage" not in kwargs)
-        assert("storage_shape" not in kwargs)
-
-        # Select the base storage and increase counter
-        base = self.base_storage[self._current]
-
-        # Construct a temporary name
-        name = kwargs.pop("name", None)
-        if name is None:
-            name = get_counted_variable(self.identifier)
-
-        # Construct the temporary and return it
-        temporary_variable(name,
-                           base_storage=base,
-                           managed=True,
-                           _base_storage_access_may_be_aliasing=True,
-                           **kwargs
-                           )
-
-        return name
-
-
-@kernel_cached
-def initialize_buffer(identifier):
-    assert isinstance(identifier, str)
-    return FlipFlopBuffer(identifier)
-
-
-def get_buffer_temporary(identifier, **kwargs):
-    return initialize_buffer(identifier).get_temporary(**kwargs)
-
-
-def switch_base_storage(identifier):
-    initialize_buffer(identifier).switch_base_storage()
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index 2193e619182f4090a0ad9d1306c9dd8d771cf4dc..b718c2c632fa2ec92cc907f6af1c0ad3d8464b9c 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -9,6 +9,7 @@ from dune.perftool.generation import (include_file,
                                       retrieve_cache_functions,
                                       )
 from dune.perftool.options import get_option
+from dune.perftool.tools import round_to_multiple
 
 from loopy.symbolic import Literal
 from loopy.target import (TargetBase,
@@ -154,14 +155,16 @@ class DuneASTBuilder(CASTBuilder):
     def get_c_expression_to_code_mapper(self):
         return DuneCExpressionToCodeMapper()
 
-    def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info):
+    def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
         # If this is not a DuneTemporaryVariable, it was introduced by loopy
         # and it should be totally under loopys control: Call the base class implementation!
         if not (isinstance(temp_var, DuneTemporaryVariable) and temp_var.custom_declaration):
-            return CASTBuilder.get_temporary_decl(self, knl, schedule_index, temp_var, decl_info)
+            return CASTBuilder.get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info)
 
-        if temp_var.decl_method:
-            return cgen.Line(temp_var.decl_method(temp_var.name, temp_var.shape, temp_var.shape_impl))
+        if temp_var.custom_declaration:
+            decl = temp_var.decl_method(temp_var.name, codegen_state.kernel, decl_info)
+            if decl:
+                return cgen.Line(decl)
 
     def add_vector_access(self, access_expr, index):
         # There is no generic way of implementing a vector access with VCL, as
@@ -176,10 +179,34 @@ class DuneASTBuilder(CASTBuilder):
         return cgen.Line("BARRIER;")
 
     def get_temporary_decls(self, codegen_state, schedule_index):
+        temps = codegen_state.kernel.temporary_variables.values()
+        # Declare all the custom base storages
+        ret = []
+        for bs in set(t.custom_base_storage for t in temps if isinstance(t, DuneTemporaryVariable)) - set({None}):
+            if bs in [a.name for a in codegen_state.kernel.args]:
+                continue
+
+            # Find the alignment bytes
+            alignment = []
+            size = []
+            for t in temps:
+                if isinstance(t, DuneTemporaryVariable) and t.custom_base_storage == bs:
+                    # TODO Extract alignment from the temporaries after switching to loopy 2018.1
+                    alignment.append(get_option("max_vector_width") // 8)
+                    from pytools import product
+                    size.append(product(t.shape))
+
+            alignment = max(alignment)
+            size = max(size)
+            size = round_to_multiple(size, alignment)
+
+            decl = "char {}[{}] __attribute__ ((aligned({})));".format(bs, size * 8, alignment)
+            ret.append(cgen.Line(decl))
+
         if self.target.declare_temporaries:
-            return CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index)
+            return ret + CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index)
         else:
-            return []
+            return ret
 
 
 class DuneTarget(TargetBase):
diff --git a/python/dune/perftool/loopy/temporary.py b/python/dune/perftool/loopy/temporary.py
index d916f6b0312ac763d60829047a52056d088014bc..2bf78ce94c573ca0f1614fc89b72fee52fa16333 100644
--- a/python/dune/perftool/loopy/temporary.py
+++ b/python/dune/perftool/loopy/temporary.py
@@ -5,6 +5,7 @@ from dune.perftool.error import PerftoolLoopyError
 
 from loopy import TemporaryVariable
 
+import loopy as lp
 import numpy
 
 
@@ -27,7 +28,10 @@ def _temporary_type(shape_impl, shape, first=True):
         return "Dune::FieldMatrix<{}, {}, {}>".format(_type, shape[0], shape[1])
 
 
-def default_declaration(name, shape=(), shape_impl=()):
+def default_declaration(name, kernel, decl_info):
+    shape = kernel.temporary_variables[name].shape
+    shape_impl = kernel.temporary_variables[name].shape_impl
+
     # Determine the C++ type to use for this temporary.
     t = _temporary_type(shape_impl, shape)
     if len(shape_impl) == 0:
@@ -44,11 +48,20 @@ def default_declaration(name, shape=(), shape_impl=()):
         return '{} {}(0.0);'.format(t, name)
 
 
+def custom_base_storage_temporary_declaration(storage):
+    def _decl(name, kernel, decl_info):
+        dtype = kernel.temporary_variables[name].dtype
+        _type = kernel.target.dtype_to_typename(decl_info.dtype)
+        return "{0} *{1} = ({0} *){2};".format(_type, name, storage)
+
+    return _decl
+
+
 class DuneTemporaryVariable(TemporaryVariable):
 
-    allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method"]
+    allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method", "custom_base_storage"]
 
-    def __init__(self, name, managed=False, shape_impl=None, decl_method=None, **kwargs):
+    def __init__(self, name, managed=False, shape_impl=None, decl_method=None, custom_base_storage=None, **kwargs):
         self.managed = managed
         self.decl_method = decl_method
         self.shape_impl = shape_impl
@@ -59,6 +72,15 @@ class DuneTemporaryVariable(TemporaryVariable):
         from dune.perftool.loopy.target import dtype_floatingpoint
         kwargs.setdefault('dtype', dtype_floatingpoint())
 
+        if custom_base_storage and self.decl_method is None:
+            assert shape_impl is None
+            self.decl_method = custom_base_storage_temporary_declaration(custom_base_storage)
+
         self.custom_declaration = self.decl_method is not None
 
-        TemporaryVariable.__init__(self, name, managed=self.managed, shape_impl=self.shape_impl, decl_method=self.decl_method, **kwargs)
+        TemporaryVariable.__init__(self, name,
+                                   managed=self.managed,
+                                   shape_impl=self.shape_impl,
+                                   decl_method=self.decl_method,
+                                   custom_base_storage=custom_base_storage,
+                                   **kwargs)
diff --git a/python/dune/perftool/loopy/transformations/disjointgroups.py b/python/dune/perftool/loopy/transformations/disjointgroups.py
deleted file mode 100644
index 791a7b35ea26cf96af224cd5ee0623cc2d7f673a..0000000000000000000000000000000000000000
--- a/python/dune/perftool/loopy/transformations/disjointgroups.py
+++ /dev/null
@@ -1,13 +0,0 @@
-""" A helper transformation that makes all groups conflicting """
-
-from dune.perftool.options import get_form_option
-
-
-def make_groups_conflicting(knl):
-    # As this transformation introduces a performance bug that basically
-    # kills our CI, we only apply it if really needed - meaning in production.
-    if get_form_option("assure_statement_ordering"):
-        groups = frozenset().union(*tuple(i.groups for i in knl.instructions))
-        return knl.copy(instructions=[i.copy(conflicts_with_groups=groups - i.groups) for i in knl.instructions])
-    else:
-        return knl
diff --git a/python/dune/perftool/loopy/transformations/vectorize_quad.py b/python/dune/perftool/loopy/transformations/vectorize_quad.py
index fa5b03c204b4d77f628a7566897d33dae3e67a7c..e3f30cc2d8d4313dbb7e30674cc59a360675fc30 100644
--- a/python/dune/perftool/loopy/transformations/vectorize_quad.py
+++ b/python/dune/perftool/loopy/transformations/vectorize_quad.py
@@ -7,8 +7,7 @@ from dune.perftool.generation import (function_mangler,
                                       )
 from dune.perftool.loopy.target import dtype_floatingpoint
 from dune.perftool.loopy.vcl import get_vcl_type, get_vcl_type_size
-from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_vector_view,
-                                                            add_vector_view,
+from dune.perftool.loopy.transformations.vectorview import (add_vector_view,
                                                             get_vector_view_name,
                                                             )
 from dune.perftool.loopy.symbolic import substitute
@@ -149,7 +148,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
             knl = knl.copy(temporary_variables=tmps)
 
             # Introduce a vector view of the precomputation result
-            knl = add_vector_view(knl, prec_quantity, flatview=True)
+            knl = add_vector_view(knl, prec_quantity)
 
     #
     # Construct a flat loop for the given instructions
@@ -196,7 +195,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
                 horizontal, vertical = tuple(int(i) for i in re.match("vecsumfac_h(.*)_v(.*)", tag).groups())
 
                 # 1. Rotating the input data
-                knl = add_vector_view(knl, quantity, flatview=True)
+                knl = add_vector_view(knl, quantity)
                 if horizontal > 1:
                     new_insns.append(lp.CallInstruction((),  # assignees
                                                         prim.Call(TransposeReg(vertical=vertical, horizontal=horizontal),
@@ -219,7 +218,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
             elif tag is not None and tag == 'sumfac':
                 # Add a vector view to this quantity
                 expr, = quantity_exprs
-                knl = add_vector_view(knl, quantity, flatview=True)
+                knl = add_vector_view(knl, quantity)
                 replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)),
                                                   (vector_indices.get(1), prim.Variable(vec_iname)),
                                                   )
@@ -243,7 +242,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix):
     for insn in insns:
         # Get a vector view of the lhs expression
         lhsname = get_pymbolic_basename(insn.assignee)
-        knl = add_vector_view(knl, lhsname, pad_to=vec_size, flatview=True)
+        knl = add_vector_view(knl, lhsname)
         lhsname = get_vector_view_name(lhsname)
         rotating = "gradvec" in insn.tags
 
diff --git a/python/dune/perftool/loopy/transformations/vectorview.py b/python/dune/perftool/loopy/transformations/vectorview.py
index 9b74e3d8ca67cc28584def2ec0055b5504febfc4..3a812abf50289b58e14bd2f155d155cc7bc3155d 100644
--- a/python/dune/perftool/loopy/transformations/vectorview.py
+++ b/python/dune/perftool/loopy/transformations/vectorview.py
@@ -5,7 +5,9 @@ being a an array of SIMD vectors
 """
 
 from dune.perftool.loopy.target import dtype_floatingpoint
+from dune.perftool.loopy.temporary import DuneTemporaryVariable
 from dune.perftool.loopy.vcl import get_vcl_type_size
+from dune.perftool.tools import round_to_multiple
 
 import loopy as lp
 import numpy as np
@@ -17,86 +19,47 @@ def get_vector_view_name(tmpname):
     return tmpname + "_vec"
 
 
-def add_vector_view(knl, tmpname, pad_to=None, flatview=False):
-    """
-    Kernel transformation to add a vector view temporary
-    that interprets the same memory as another temporary
-    """
+def add_vector_view(knl, tmpname, pad_to=1):
     temporaries = knl.temporary_variables
-    assert tmpname in temporaries
     temp = temporaries[tmpname]
-    vecname = get_vector_view_name(tmpname)
+    vectemp = get_vector_view_name(tmpname)
     bsname = tmpname + "_base"
+    vecsize = get_vcl_type_size(temp.dtype)
 
-    if vecname in knl.temporary_variables:
+    # Enforce idempotency
+    if vectemp in temporaries:
         return knl
 
-    # Add base storage to the original temporary!
-    if not temp.base_storage:
-        temp = temp.copy(base_storage=bsname,
-                         _base_storage_access_may_be_aliasing=True,
-                         )
-        temporaries[tmpname] = temp
-    else:
-        bsname = temp.base_storage
-
-    # Determine the shape by dividing total size by vector size
-    # Also apply the padding we need for rotation
-    # TODO: *Only* apply this padding if really needed (a bit hard to figure out)
-    vecsize = get_vcl_type_size(temp.dtype)
-    if all(isinstance(s, int) for s in temp.shape):
-        size = pt.product(temp.shape) // vecsize
-        if size % vecsize != 0:
-            size = (size // vecsize + 1) * vecsize
+    # Modify the original temporary to use our custom base storage mechanism
+    if isinstance(temp, DuneTemporaryVariable):
+        if temp.custom_base_storage:
+            bsname = temp.custom_base_storage
+        else:
+            temp = temp.copy(custom_base_storage=bsname)
+            temporaries[tmpname] = temp
     else:
-        size = prim.FloorDiv(prim.Product(temp.shape), vecsize)
-        size = (size // vecsize + 1) * vecsize
-
-    # Maybe do some padding.
-    if pad_to:
-        size = (size // pad_to + 1) * pad_to
+        temp = DuneTemporaryVariable(custom_base_storage=bsname,
+                                     managed=True,
+                                     **temp.get_copy_kwargs()
+                                     )
+        temporaries[tmpname] = temp
 
-    # Some vectorview are intentionally flat! (e.g. the output buffers of
-    # sum factorization kernels
-    if flatview:
-        shape = (size, vecsize)
-        dim_tags = "c,vec"
-    else:
-        shape = temp.shape
-        # This works around a loopy weirdness (which might as well be a bug)
-        # TODO: investigate this!
-        if len(shape) == 1:
-            shape = (1, vecsize)
-            dim_tags = "c,vec"
-        else:
-            dim_tags = temp.dim_tags[:-1] + ("vec",)
+    size = round_to_multiple(pt.product(temp.shape), vecsize) // vecsize
+    size = round_to_multiple(size, pad_to)
 
     # Now add a vector view temporary
-    vecname = tmpname + "_vec"
-    temporaries[vecname] = lp.TemporaryVariable(vecname,
-                                                dim_tags=dim_tags,
-                                                shape=shape,
-                                                base_storage=bsname,
-                                                dtype=dtype_floatingpoint(),
-                                                scope=lp.temp_var_scope.PRIVATE,
-                                                _base_storage_access_may_be_aliasing=True,
-                                                )
-
-    # Avoid that any of these temporaries are eliminated
-    silenced = ['temp_to_write({})'.format(tmpname),
-                'temp_to_write({})'.format(vecname),
-                'read_no_write({})'.format(tmpname),
-                'read_no_write({})'.format(vecname),
+    temporaries[vectemp] = DuneTemporaryVariable(vectemp,
+                                                 dim_tags="c,vec",
+                                                 shape=(size, vecsize),
+                                                 custom_base_storage=bsname,
+                                                 scope=lp.temp_var_scope.PRIVATE,
+                                                 managed=True,
+                                                 )
+
+    # Avoid that these temporaries are eliminated
+    silenced = ['temp_to_write({})'.format(vectemp),
+                'read_no_write({})'.format(vectemp),
                 ]
 
     return knl.copy(temporary_variables=temporaries,
                     silenced_warnings=knl.silenced_warnings + silenced)
-
-
-def add_temporary_with_vector_view(knl, name, *args, **kwargs):
-    temps = knl.temporary_variables
-    assert name not in temps
-    temps[name] = lp.TemporaryVariable(name, *args, **kwargs)
-    knl = knl.copy(temporary_variables=temps)
-    knl = add_vector_view(knl, name)
-    return knl
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 38a3be311037dedae17ab5db93393f6ea398fd29..229b989002dd417ddb67d451f6c0de11f2bfc124 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -86,7 +86,6 @@ class PerftoolFormOptionsArray(ImmutableRecord):
     vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
     vectorization_list_index = PerftoolOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).")
     simplify = PerftoolOption(default=False, helpstr="Whether to simplify expressions using sympy")
-    assure_statement_ordering = PerftoolOption(default=False, helpstr="Whether special care should be taken for a good statement ordering in sumfact kernels, runs into a loopy scheduler performance bug, but is necessary for production.")
     generate_jacobians = PerftoolOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.")
     generate_residuals = PerftoolOption(default=True, helpstr="Whether alpha_* methods should be generated.")
     unroll_dimension_loops = PerftoolOption(default=False, helpstr="whether loops over the geometric dimension should be unrolled")
diff --git a/python/dune/perftool/pdelab/adjoint.py b/python/dune/perftool/pdelab/adjoint.py
index 14d660e80f5c012676c1118943e1790115411af5..aef03bbbf08e45f2510bd90f164bdd70b071580c 100644
--- a/python/dune/perftool/pdelab/adjoint.py
+++ b/python/dune/perftool/pdelab/adjoint.py
@@ -145,7 +145,11 @@ def generate_kernel(forms):
     for i, form in enumerate(forms):
         for integral in form:
             visit_integral(integral, i, len(forms))
-    knl = extract_kernel_from_cache("kernel_default")
+
+    from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature
+    name = kernel_name()
+    signature = assembly_routine_signature()
+    knl = extract_kernel_from_cache("kernel_default", name, signature)
     delete_cache_items("kernel_default")
 
     # Reset the quadrature degree
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index d4db8834090378d5f037bc037c11a91176a7ebf3..2c724f8e4a716c46475434d4f1949de6ba512632 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -86,7 +86,7 @@ def declare_cache_temporary(element, restriction, which):
     t_cache = type_localbasis_cache(element)
     lfs = name_leaf_lfs(element, restriction)
 
-    def decl(name, shape, shape_impl):
+    def decl(name, kernel, decl_info):
         return "typename {}::{}ReturnType {};".format(t_cache,
                                                       which,
                                                       name,
diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py
index c859c99d0ab964e298a55bf2c9854f170c507599..3877165c574ac8ef7b6d7197b56f97d071f38f42 100644
--- a/python/dune/perftool/pdelab/driver/__init__.py
+++ b/python/dune/perftool/pdelab/driver/__init__.py
@@ -298,10 +298,10 @@ def generate_driver():
     add_section("constraints", "Set up constraints container...")
     add_section("gridoperator", "Set up grid grid operators...")
     add_section("vector", "Set up solution vectors...")
+    add_section("timings", "Maybe take performance measurements...")
     add_section("solver", "Set up (non)linear solvers...")
     add_section("vtk", "Do visualization...")
     add_section("instat", "Set up instationary stuff...")
-    add_section("timings", "Maybe take performance measurements...")
     add_section("printing", "Maybe print residuals and matrices to stdout...")
     add_section("error", "Maybe calculate errors for test results...")
 
diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py
index 8f3ef96ff9fd2babe838a2b3ceb7baeeb63d67f7..0648df399d809fd84d30624711dfc2fb02bbc631 100644
--- a/python/dune/perftool/pdelab/driver/solve.py
+++ b/python/dune/perftool/pdelab/driver/solve.py
@@ -54,7 +54,8 @@ def dune_solve():
     print_matrix()
 
     if get_option('instrumentation_level') >= 2:
-        from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream
+        from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier
+        timestream = name_timing_stream()
         setup_timer()
         from dune.perftool.generation import post_include
         post_include("HP_DECLARE_TIMER(solve);", filetag="driver")
@@ -68,7 +69,6 @@ def dune_solve():
         if get_option('instrumentation_level') >= 3:
             from dune.perftool.pdelab.driver.gridoperator import name_localoperator
             lop_name = name_localoperator(form_ident)
-            timestream = name_timing_stream()
             solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
     return solve
diff --git a/python/dune/perftool/pdelab/function.py b/python/dune/perftool/pdelab/function.py
index c10324003955d0bbe2bbbf53f3adc73c57fd10c4..c1dadef18fdcb8a60a88d06aff5fac186a80432d 100644
--- a/python/dune/perftool/pdelab/function.py
+++ b/python/dune/perftool/pdelab/function.py
@@ -19,7 +19,7 @@ def bind_gridfunction_to_element(gf, restriction):
 
 
 def declare_grid_function_range(gridfunction):
-    def _decl(name, *args):
+    def _decl(name, kernel, decl_info):
         return "typename decltype({})::Range {};".format(gridfunction, name)
 
     return _decl
diff --git a/python/dune/perftool/pdelab/geometry.py b/python/dune/perftool/pdelab/geometry.py
index c382faeea09e13208aa7155b120fb070789c80e6..d308917adbd3391d4b61832938b3a1c4f3355eb5 100644
--- a/python/dune/perftool/pdelab/geometry.py
+++ b/python/dune/perftool/pdelab/geometry.py
@@ -261,7 +261,7 @@ def evaluate_unit_outer_normal(name):
 
 
 @preamble
-def declare_normal(name, shape, shape_impl):
+def declare_normal(name, kernel, decl_info):
     ig = name_intersection_geometry_wrapper()
     return "auto {} = {}.centerUnitOuterNormal();".format(name, ig)
 
@@ -300,7 +300,7 @@ def type_jacobian_inverse_transposed(restriction):
 @kernel_cached
 def define_jacobian_inverse_transposed_temporary(restriction):
     @preamble
-    def _define_jacobian_inverse_transposed_temporary(name, shape, shape_impl):
+    def _define_jacobian_inverse_transposed_temporary(name, kernel, decl_info):
         t = type_jacobian_inverse_transposed(restriction)
         return "{} {};".format(t,
                                name,
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 6f109f4cc05c21b0962c3065b1820fea749863ab..549cd30cbf2fce97e3f071bf428f14bd4e447e31 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -473,7 +473,11 @@ def generate_kernel(integrals):
     delete_cache_items("kernel_default")
     for integral in integrals:
         visit_integral(integral)
-    knl = extract_kernel_from_cache("kernel_default")
+
+    from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature
+    name = kernel_name()
+    signature = assembly_routine_signature()
+    knl = extract_kernel_from_cache("kernel_default", name, signature)
     delete_cache_items("kernel_default")
 
     # Reset the quadrature degree
@@ -491,7 +495,7 @@ def generate_kernels_per_integral(integrals):
     yield generate_kernel(integrals)
 
 
-def extract_kernel_from_cache(tag, wrap_in_cgen=True):
+def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timings=True):
     # Now extract regular loopy kernel components
     from dune.perftool.loopy.target import DuneTarget
     domains = [i for i in retrieve_cache_items("{} and domain".format(tag))]
@@ -512,13 +516,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
                   check_dep_resolution=False,
                   )
 
-    # Find a name for the kernel
-    if wrap_in_cgen:
-        from dune.perftool.pdelab.signatures import kernel_name
-        name = kernel_name()
-    else:
-        name = "constructor_kernel"
-
     # Create the kernel
     from loopy import make_kernel, preprocess_kernel
     kernel = make_kernel(domains,
@@ -534,9 +531,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
     from loopy import make_reduction_inames_unique
     kernel = make_reduction_inames_unique(kernel)
 
-    from dune.perftool.loopy.transformations.disjointgroups import make_groups_conflicting
-    kernel = make_groups_conflicting(kernel)
-
     # Apply the transformations that were gathered during tree traversals
     for trafo in transformations:
         kernel = trafo[0](kernel, *trafo[1], **trafo[2])
@@ -570,9 +564,10 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
 
     if wrap_in_cgen:
         # Wrap the kernel in something which can generate code
-        from dune.perftool.pdelab.signatures import assembly_routine_signature
-        signature = assembly_routine_signature()
-        kernel = LoopyKernelMethod(signature, kernel)
+        if signature is None:
+            from dune.perftool.pdelab.signatures import assembly_routine_signature
+            signature = assembly_routine_signature()
+        kernel = LoopyKernelMethod(signature, kernel, add_timings=add_timings)
 
     return kernel
 
@@ -673,7 +668,7 @@ def cgen_class_from_cache(tag, members=[]):
     tparams = [i for i in retrieve_cache_items('{} and template_param'.format(tag))]
 
     # Construct the constructor
-    constructor_knl = extract_kernel_from_cache(tag, wrap_in_cgen=False)
+    constructor_knl = extract_kernel_from_cache(tag, "constructor_kernel", None, wrap_in_cgen=False)
     from dune.perftool.loopy.target import DuneTarget
     constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
     signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params))
@@ -1031,10 +1026,23 @@ def generate_localoperator_kernels(operator):
 
 
 def generate_localoperator_file(kernels, filename):
+    logger = logging.getLogger(__name__)
+
     operator_methods = []
     for k in kernels.values():
         operator_methods.extend(k)
 
+    # Generate all the realizations of sum factorization kernel objects needed in this operator
+    sfkernels = [sf for sf in retrieve_cache_items("kernelimpl")]
+    if sfkernels:
+        logger.info("generate_localoperator_kernels: Create {} sumfact kernel realizations".format(len(sfkernels)))
+
+    from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
+    for sf, qp in sfkernels:
+        from dune.perftool.sumfact.tabulation import set_quadrature_points
+        set_quadrature_points(qp)
+        operator_methods.append(realize_sumfact_kernel_function(sf))
+
     if get_option('instrumentation_level') >= 3:
         include_file('dune/perftool/common/timer.hh', filetag='operatorfile')
         operator_methods.append(TimerMethod())
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index e928b817eaf7b5f0eb9492f64880f554578c6fe0..3417adf998d43cf9b6720b60e208d9da6518542c 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -23,8 +23,9 @@ from dune.perftool.options import (get_form_option,
                                    get_option,
                                    )
 from dune.perftool.loopy.flatten import flatten_index
-from dune.perftool.loopy.buffer import get_buffer_temporary
+from dune.perftool.loopy.target import type_floatingpoint
 from dune.perftool.sumfact.quadrature import nest_quadrature_loops
+from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.localoperator import determine_accumulation_space
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.pdelab.signatures import assembler_routine_name
@@ -36,13 +37,13 @@ from dune.perftool.sumfact.tabulation import (basis_functions_per_direction,
 from dune.perftool.sumfact.switch import (get_facedir,
                                           get_facemod,
                                           )
-from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelOutputBase
+from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase
 from dune.perftool.ufl.modified_terminals import extract_modified_arguments
 from dune.perftool.tools import get_pymbolic_basename, get_leaf
 from dune.perftool.error import PerftoolError
 from dune.perftool.sumfact.quadrature import quadrature_inames
 
-from pytools import ImmutableRecord
+from pytools import ImmutableRecord, product
 
 import loopy as lp
 import numpy as np
@@ -85,7 +86,7 @@ def accum_iname(element, bound, i):
     return sumfact_iname(bound, "accum{}".format(suffix))
 
 
-class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord):
+class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
     def __init__(self,
                  accumvar=None,
                  restriction=None,
@@ -107,6 +108,14 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord):
     def __repr__(self):
         return ImmutableRecord.__repr__(self)
 
+    @property
+    def stage(self):
+        return 3
+
+    @property
+    def direct_is_possible(self):
+        return get_form_option("fastdg")
+
     @property
     def within_inames(self):
         if self.trial_element is None:
@@ -167,52 +176,73 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord):
                               forced_iname_deps=frozenset(inames + additional_inames + self.within_inames),
                               forced_iname_deps_is_final=True,
                               depends_on=insn_dep,
-                              predicates=sf.predicates
+                              predicates=sf.predicates,
+                              tags=frozenset({"sumfact_stage3"}),
                               )
 
         return frozenset({dep})
 
-    def realize_direct(self, result, inames, shape, args):
-        ft = get_global_context_value("form_type")
-
-        if self.test_element_index is None:
-            direct_output = "{}_access".format(self.accumvar)
-        else:
-            direct_output = "{}_access_comp{}".format(self.accumvar, self.test_element_index)
+    def realize_direct(self, result, inames, shape, which=0, **args):
+        direct_output = "fastdg{}".format(which)
         ftags = ",".join(["f"] * len(shape))
 
-        from dune.perftool.sumfact.realization import alias_data_array
-        if ft == 'residual' or ft == 'jacobian_apply':
+        if self.trial_element is None:
             globalarg(direct_output,
                       shape=shape,
                       dim_tags=ftags,
                       offset=_dof_offset(self.test_element, self.test_element_index),
                       )
-            alias_data_array(direct_output, self.accumvar)
             lhs = prim.Subscript(prim.Variable(direct_output), inames)
         else:
-            assert ft == 'jacobian'
-
-            direct_output = "{}x{}".format(direct_output, self.trial_element_index)
             rowsize = sum(tuple(s for s in _local_sizes(self.trial_element)))
-            element = get_leaf(self.trial_element, self.trial_element_index)
-            other_shape = tuple(element.degree() + 1 for e in shape)
-            from pytools import product
             manual_strides = tuple("stride:{}".format(rowsize * product(shape[:i])) for i in range(len(shape)))
-            dim_tags = "{},{}".format(ftags, ",".join(manual_strides))
+            offset = "jacobian_offset{}".format(which)
+            valuearg(offset)
             globalarg(direct_output,
-                      shape=other_shape + shape,
-                      offset=rowsize * _dof_offset(self.test_element, self.test_element_index) + _dof_offset(self.trial_element, self.trial_element_index),
-                      dim_tags=dim_tags,
+                      shape=shape,
+                      offset=prim.Variable(offset) + rowsize * _dof_offset(self.test_element, self.test_element_index) + _dof_offset(self.trial_element, self.trial_element_index),
+                      dim_tags=manual_strides,
                       )
-            alias_data_array(direct_output, self.accumvar)
-            # TODO: It is at least questionnable, whether using the *order* of the inames in here
-            #       for indexing is a good idea. Then again, it is hard to find an alternative.
-            _ansatz_inames = tuple(prim.Variable(i) for i in self.within_inames)
-            lhs = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + inames)
+            lhs = prim.Subscript(prim.Variable(direct_output), inames)
 
         result = prim.Sum((lhs, result))
-        return frozenset({instruction(assignee=lhs, expression=result, **args)})
+        return frozenset({instruction(assignee=lhs,
+                                      expression=result,
+                                      tags=frozenset({"sumfact_stage3"}),
+                                      **args)})
+
+    @property
+    def function_name_suffix(self):
+        if get_form_option("fastdg"):
+            suffix = "_fastdg1_{}comp{}".format(FEM_name_mangling(self.test_element), self.test_element_index)
+            if self.within_inames:
+                suffix = "{}x{}comp{}".format(suffix, FEM_name_mangling(self.trial_element), self.trial_element_index)
+            return suffix
+        else:
+            return ""
+
+    @property
+    def function_args(self):
+        if get_form_option("fastdg"):
+            ret = ("{}.data()".format(self.accumvar),)
+            if get_form_option("fastdg") and self.within_inames:
+                element = get_leaf(self.trial_element, self.trial_element_index)
+                shape = tuple(element.degree() + 1 for e in range(element.cell().geometric_dimension()))
+                jacobian_index = flatten_index(tuple(prim.Variable(i) for i in self.within_inames), shape, order="f")
+                ret = ret + (str(jacobian_index),)
+            return ret
+        else:
+            return ()
+
+    @property
+    def signature_args(self):
+        if get_form_option('fastdg'):
+            ret = ("{}* fastdg0".format(type_floatingpoint()),)
+            if self.within_inames:
+                ret = ret + ("unsigned int jacobian_offset0",)
+            return ret
+        else:
+            return ()
 
 
 def _local_sizes(element):
@@ -411,9 +441,8 @@ def generate_accumulation_instruction(expr, visitor):
                                 )
 
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
-                       stage=3,
                        position_priority=priority,
-                       output=output,
+                       interface=output,
                        predicates=predicates,
                        )
 
@@ -427,11 +456,14 @@ def generate_accumulation_instruction(expr, visitor):
 
     vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset()
 
-    temp = get_buffer_temporary(buffer,
-                                shape=vsf.quadrature_shape,
-                                dim_tags=vsf.quadrature_dimtags,
-                                name="input_{}".format(buffer),
-                                )
+    from dune.perftool.sumfact.realization import name_buffer_storage
+    temp = "input_{}".format(buffer)
+    temporary_variable(temp,
+                       shape=vsf.quadrature_shape,
+                       dim_tags=vsf.quadrature_dimtags,
+                       custom_base_storage=name_buffer_storage(buffer, 0),
+                       managed=True,
+                       )
 
     # Those input fields, that are padded need to be set to zero
     # in order to do a horizontal_add later on
@@ -447,12 +479,19 @@ def generate_accumulation_instruction(expr, visitor):
     # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
     timer_dep = frozenset()
     if get_option("instrumentation_level") >= 4:
-        timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+        timer_name = "{}_kernel_stage1".format(assembler_routine_name())
+        timer_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                           depends_on=frozenset({lp.match.Tagged("sumfact_stage1"), 'hptimerstart_{}'.format(timer_name)}),
+                                           id="hptimerstop_{}".format(timer_name)
+                                           )}
+                              )
+        timer_name = '{}_kernel_quadratureloop'.format(assembler_routine_name())
         post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
         dump_accumulate_timer(timer_name)
-        if(jacobian_inames):
-            timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                               within_inames=frozenset(jacobian_inames))})
+        timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                           within_inames=frozenset(jacobian_inames),
+                                           id="hptimerstart_{}".format(timer_name),
+                                           depends_on=timer_dep)})
 
     # Determine dependencies
     from loopy.match import Or, Writes
@@ -476,9 +515,10 @@ def generate_accumulation_instruction(expr, visitor):
         insn_dep = frozenset({contrib_dep})
 
     if get_option("instrumentation_level") >= 4:
-        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                          depends_on=insn_dep,
-                                          within_inames=frozenset(jacobian_inames))})
+        insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                             depends_on=insn_dep,
+                                             within_inames=frozenset(jacobian_inames),
+                                             id="hptimerstop_{}".format(timer_name))}))
 
     # Add a sum factorization kernel that implements the multiplication
     # with the test function (stage 3)
@@ -486,4 +526,12 @@ def generate_accumulation_instruction(expr, visitor):
     result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep)))
 
     if not get_form_option("fastdg"):
-        vsf.output.realize(vsf, result, insn_dep)
+        insn_dep = vsf.interface.realize(vsf, result, insn_dep)
+
+    if get_option("instrumentation_level") >= 4:
+        assert vsf.stage == 3
+        timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), vsf.stage)
+        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                          depends_on=frozenset({lp.match.Tagged("sumfact_stage3")}),
+                                          within_inames=frozenset(jacobian_inames),
+                                          id="hptimerstop_{}".format(timer_name))})
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 7c2740e13f195678ff234c9dbd946002f710eb83..39bba49e67fe66a4541ac0ae633e6e403be32c52 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -17,6 +17,7 @@ from dune.perftool.generation import (backend,
                                       kernel_cached,
                                       temporary_variable,
                                       )
+from dune.perftool.loopy.target import type_floatingpoint
 from dune.perftool.sumfact.tabulation import (basis_functions_per_direction,
                                               construct_basis_matrix_sequence,
                                               BasisTabulationMatrix,
@@ -32,8 +33,7 @@ from dune.perftool.pdelab.argument import name_coefficientcontainer
 from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            )
-from dune.perftool.loopy.buffer import initialize_buffer, get_buffer_temporary
-from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase
+from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase
 from dune.perftool.options import get_form_option
 from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.restriction import restricted_name
@@ -51,7 +51,7 @@ from loopy.match import Writes
 import pymbolic.primitives as prim
 
 
-class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
+class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
     def __init__(self,
                  coeff_func=None,
                  element=None,
@@ -72,7 +72,11 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
         return repr(self)
 
     @property
-    def direct_input_is_possible(self):
+    def stage(self):
+        return 1
+
+    @property
+    def direct_is_possible(self):
         return get_form_option("fastdg")
 
     def realize(self, sf, insn_dep, index=0):
@@ -83,10 +87,13 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
         coeff = pc(container, lfs, basisiname)
 
         # Get the input temporary!
-        name = get_buffer_temporary(sf.buffer,
-                                    shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width),
-                                    name="input_{}".format(sf.buffer)
-                                    )
+        from dune.perftool.sumfact.realization import name_buffer_storage
+        name = "input_{}".format(sf.buffer)
+        temporary_variable(name,
+                           shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width),
+                           custom_base_storage=name_buffer_storage(sf.buffer, 0),
+                           managed=True,
+                           )
 
         assignee = prim.Subscript(prim.Variable(name),
                                   (prim.Variable(basisiname),) + (index,))
@@ -98,24 +105,40 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_direct(self, shape, inames):
-        arg = "{}_access{}".format(self.coeff_func(self.restriction),
-                                   "_comp{}".format(self.element_index) if self.element_index else ""
-                                   )
+    def realize_direct(self, shape, inames, which=0):
+        arg = "fastdg{}".format(which)
 
         from dune.perftool.sumfact.accumulation import _dof_offset
-        from dune.perftool.sumfact.realization import alias_data_array
         globalarg(arg,
                   shape=shape,
                   dim_tags=",".join("f" * len(shape)),
                   offset=_dof_offset(self.element, self.element_index),
                   )
 
-        func = self.coeff_func(self.restriction)
-        alias_data_array(arg, func)
-
         return prim.Subscript(prim.Variable(arg), inames)
 
+    @property
+    def function_name_suffix(self):
+        if get_form_option("fastdg"):
+            return "_fastdg1_{}comp{}".format(FEM_name_mangling(self.element), self.element_index)
+        else:
+            return ""
+
+    @property
+    def function_args(self):
+        if get_form_option("fastdg"):
+            func = self.coeff_func(self.restriction)
+            return ("{}.data()".format(func),)
+        else:
+            return ()
+
+    @property
+    def signature_args(self):
+        if get_form_option("fastdg"):
+            return ("const {}* fastdg0".format(type_floatingpoint()),)
+        else:
+            return ()
+
 
 def _basis_functions_per_direction(element):
     """Number of basis functions per direction """
@@ -162,7 +185,7 @@ def pymbolic_coefficient_gradient(element, restriction, index, coeff_func, visit
     # The sum factorization kernel object gathering all relevant information
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
                        position_priority=grad_index,
-                       input=inp,
+                       interface=inp,
                        )
 
     from dune.perftool.sumfact.vectorization import attach_vectorization_info
@@ -203,7 +226,7 @@ def pymbolic_coefficient(element, restriction, index, coeff_func, visitor):
                                 )
 
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
-                       input=inp,
+                       interface=inp,
                        position_priority=3,
                        )
 
diff --git a/python/dune/perftool/sumfact/geometry.py b/python/dune/perftool/sumfact/geometry.py
index d0763db759836bcd49150c9a983dcd357a49006d..7b78de412d2e2c5a893e2d1d1d7e32315bd6aafd 100644
--- a/python/dune/perftool/sumfact/geometry.py
+++ b/python/dune/perftool/sumfact/geometry.py
@@ -12,13 +12,12 @@ from dune.perftool.generation import (backend,
                                       temporary_variable,
                                       globalarg,
                                       )
-from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            name_geometry,
                                            )
 from dune.perftool.sumfact.switch import get_facedir
-from dune.perftool.sumfact.symbolic import SumfactKernelInputBase
+from dune.perftool.sumfact.symbolic import SumfactKernelInterfaceBase
 from dune.perftool.sumfact.vectorization import attach_vectorization_info
 from dune.perftool.options import get_form_option, option_switch
 from dune.perftool.ufl.modified_terminals import Restriction
@@ -36,15 +35,18 @@ def corner_iname():
     return name
 
 
-class GeoCornersInput(SumfactKernelInputBase, ImmutableRecord):
+class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord):
     def __init__(self, dir):
         ImmutableRecord.__init__(self, dir=dir)
 
     def realize(self, sf, index, insn_dep):
-        name = get_buffer_temporary(sf.buffer,
-                                    shape=(2 ** local_dimension(), sf.vector_width),
-                                    name="input_{}".format(sf.buffer)
-                                    )
+        from dune.perftool.sumfact.realization import name_buffer_storage
+        name = "input_{}".format(sf.buffer)
+        temporary_variable(name,
+                           shape=(2 ** local_dimension(), sf.vector_width),
+                           custom_base_storage=name_buffer_storage(sf.buffer, 0),
+                           managed=True,
+                           )
 
         ciname = corner_iname()
         geo = name_geometry()
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index d4aea877bdfd46157f3447baaf02869a2dfe6131..55e141086584e65ddec50fb1f7397c979661c663 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -3,20 +3,20 @@ The code that triggers the creation of the necessary code constructs
 to realize a sum factorization kernel
 """
 from dune.perftool.generation import (barrier,
+                                      delete_cache_items,
                                       dump_accumulate_timer,
                                       generator_factory,
                                       get_global_context_value,
                                       globalarg,
                                       instruction,
+                                      kernel_cached,
                                       post_include,
                                       preamble,
                                       silenced_warning,
                                       temporary_variable,
                                       transform,
                                       )
-from dune.perftool.loopy.buffer import (get_buffer_temporary,
-                                        switch_base_storage,
-                                        )
+from dune.perftool.loopy.flatten import flatten_index
 from dune.perftool.pdelab.argument import pymbolic_coefficient
 from dune.perftool.pdelab.basis import shape_as_pymbolic
 from dune.perftool.pdelab.geometry import world_dimension
@@ -28,11 +28,17 @@ from dune.perftool.sumfact.permutation import (sumfact_permutation_strategy,
                                                permute_backward,
                                                permute_forward,
                                                )
+from dune.perftool.sumfact.quadrature import quadrature_points_per_direction
+from dune.perftool.sumfact.symbolic import (SumfactKernel,
+                                            VectorizedSumfactKernel,
+                                            )
 from dune.perftool.sumfact.vectorization import attach_vectorization_info
 from dune.perftool.sumfact.accumulation import sumfact_iname
 from dune.perftool.loopy.target import dtype_floatingpoint
 from dune.perftool.loopy.vcl import ExplicitVCLCast
+from dune.perftool.tools import get_leaf, remove_duplicates
 
+from pytools import product
 from ufl import MixedElement
 
 import loopy as lp
@@ -40,6 +46,11 @@ import numpy as np
 import pymbolic.primitives as prim
 
 
+# Have a generator function store the necessary sum factorization kernel implementations
+# This way then can easily be extracted at the end of the form visiting process
+necessary_kernel_implementations = generator_factory(item_tags=("kernelimpl",), cache_key_generator=lambda a: a[0].function_name, no_deco=True)
+
+
 def realize_sum_factorization_kernel(sf, **kwargs):
     if get_global_context_value("dry_run", False):
         return sf, sf.insn_dep
@@ -47,36 +58,114 @@ def realize_sum_factorization_kernel(sf, **kwargs):
         return _realize_sum_factorization_kernel(sf, **kwargs)
 
 
-@preamble
-def alias_data_array(name, data):
-    return "auto {} = {}.data();".format(name, data)
+def name_buffer_storage(buff, which):
+    name = "{}_{}".format(buff, which)
+    return name
 
 
-@generator_factory(item_tags=("sumfactkernel",),
-                   context_tags=("kernel",),
-                   cache_key_generator=lambda s, **kw: s.cache_key)
+@kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
 
     # Measure times and count operations in c++ code
     if get_option("instrumentation_level") >= 4:
-        if sf.stage == 1:
-            setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
-            insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer),
-                                                             within_inames=frozenset(sf.within_inames),
-                                                             depends_on=insn_dep)}))
+        setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
+        timer_dep = frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer),
+                                           id="hptimerstop_{}".format(setuptimer))})
+
+        timer_name = '{}_kernel_stage1'.format(assembler_routine_name())
+        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+        dump_accumulate_timer(timer_name)
+        timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                                           id="hptimerstart_{}".format(timer_name),
+                                                           depends_on=timer_dep,
+                                                           ),
+                                               }))
 
-        timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage)
+        timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), sf.stage)
         post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
         dump_accumulate_timer(timer_name)
-        insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                                         within_inames=frozenset(sf.within_inames),
-                                                         depends_on=insn_dep,
-                                                         ),
-                                             }))
+        timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                                           id="hptimerstart_{}".format(timer_name),
+                                                           within_inames=frozenset(sf.within_inames),
+                                                           depends_on=timer_dep.union(insn_dep),
+                                                           ),
+                                               }))
+        insn_dep = insn_dep.union(timer_dep)
+
+    # Get all the necessary pieces for a function call
+    buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2))
+
+    # Make sure that the storage is allocated and has a certain minimum size
+    # This is necessary to allocate buffers that will be passed to sumfact kernel
+    # functions. Loopy has no knowledge of what happens with those...
+    for buf in buffers:
+        # Determine the necessary size of the buffer. We assume that we do not
+        # underintegrate the form!!!
+        size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
+                   product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
+        temporary_variable("{}_dummy".format(buf),
+                           shape=(size,),
+                           custom_base_storage=buf,
+                           decl_method=lambda n, k, di: None,
+                           )
+
+    # Realize the input if it is not direct
+    if sf.stage == 1 and not sf.interface.direct_is_possible:
+        insn_dep = insn_dep.union(sf.interface.realize(sf, insn_dep))
+
+    # Trigger generation of the sum factorization kernel function
+    qp = quadrature_points_per_direction()
+    necessary_kernel_implementations((sf, qp))
+
+    # Call the function
+    code = "{}({});".format(sf.function_name, ", ".join(buffers + sf.interface.function_args))
+    tag = "sumfact_stage{}".format(sf.stage)
+    insn_dep = frozenset({instruction(code=code,
+                                      depends_on=insn_dep,
+                                      within_inames=frozenset(sf.within_inames),
+                                      tags=frozenset({tag}),
+                                      predicates=sf.predicates,
+                                      )
+                          })
+
+    # Interpret the output as a temporary of correct shape
+    out = "{}_output".format(sf.buffer)
+    temporary_variable(out,
+                       shape=sf.output_shape,
+                       dim_tags=sf.output_dimtags,
+                       custom_base_storage=buffers[sf.length % 2],
+                       managed=True,
+                       )
+    silenced_warning("read_no_write({})".format(out))
+
+    return lp.TaggedVariable(out, sf.tag), insn_dep
+
+
+class BufferSwitcher(object):
+    def __init__(self):
+        self.current = 0
 
-    if not sf.input.direct_input_is_possible:
-        insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep))
+    def get_temporary(self, name=None, **kwargs):
+        assert name
+        bs = "buffer{}".format(self.current)
+        globalarg(bs)
+        temporary_variable(name,
+                           managed=True,
+                           custom_base_storage=bs,
+                           **kwargs
+                           )
+
+        return name
+
+    def switch(self):
+        self.current = (self.current + 1) % 2
+
+
+def realize_sumfact_kernel_function(sf):
+    # Get a buffer switcher instance
+    buffer = BufferSwitcher()
+    insn_dep = frozenset()
 
     # Prepare some dim_tags/shapes for later use
     ftags = ",".join(["f"] * sf.length)
@@ -129,12 +218,12 @@ def _realize_sum_factorization_kernel(sf):
         # * a global data structure (if FastDGGridOperator is in use)
         # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
         input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
-        if l == 0 and sf.input.direct_input_is_possible:
+        if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             # See comment below
             input_inames = permute_backward(input_inames, perm)
             inp_shape = permute_backward(inp_shape, perm)
 
-            input_summand = sf.input.realize_direct(inp_shape, input_inames)
+            input_summand = sf.interface.realize_direct(inp_shape, input_inames)
         else:
             # If we did permute the order of a matrices above we also
             # permuted the order of out_inames. Unfortunately the
@@ -147,9 +236,10 @@ def _realize_sum_factorization_kernel(sf):
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
-            inp = get_buffer_temporary(sf.buffer,
+            inp = buffer.get_temporary("buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
-                                       dim_tags=ftags)
+                                       dim_tags=ftags,
+                                       )
 
             # The input temporary will only be read from, so we need to silence the loopy warning
             silenced_warning('read_no_write({})'.format(inp))
@@ -157,7 +247,7 @@ def _realize_sum_factorization_kernel(sf):
             input_summand = prim.Subscript(prim.Variable(inp),
                                            input_inames + vec_iname)
 
-        switch_base_storage(sf.buffer)
+        buffer.switch()
 
         # Get a temporary that interprets the base storage of the output.
         #
@@ -171,9 +261,10 @@ def _realize_sum_factorization_kernel(sf):
         output_shape = tuple(out_shape[1:]) + (out_shape[0],)
         if l == len(matrix_sequence) - 1:
             output_shape = permute_backward(output_shape, perm)
-        out = get_buffer_temporary(sf.buffer,
+        out = buffer.get_temporary("buff_step{}_out".format(l),
                                    shape=output_shape + vec_shape,
-                                   dim_tags=ftags)
+                                   dim_tags=ftags,
+                                   )
 
         # Write the matrix-matrix multiplication expression
         matprod = prim.Product((matrix.pymbolic((prim.Variable(out_inames[0]), k_expr) + vec_iname),
@@ -189,25 +280,15 @@ def _realize_sum_factorization_kernel(sf):
         if l == len(matrix_sequence) - 1:
             output_inames = permute_backward(output_inames, perm)
 
-        tag = "sumfact_stage{}".format(sf.stage)
-        if sf.stage == 3:
-            tag = "{}_{}".format(tag, "_".join(sf.within_inames))
-
         # Collect the key word arguments for the loopy instruction
-        insn_args = {"forced_iname_deps": frozenset([i for i in out_inames]).union(frozenset(sf.within_inames)),
-                     "forced_iname_deps_is_final": True,
-                     "depends_on": insn_dep,
-                     "tags": frozenset({tag}),
-                     "predicates": sf.predicates,
-                     "groups": frozenset({sf.group_name}),
-                     }
+        insn_args = {"depends_on": insn_dep}
 
         # In case of direct output we directly accumulate the result
         # of the Sumfactorization into some global data structure.
         if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3:
             if sf.vectorized:
-                insn_args["forced_iname_deps"] = insn_args["forced_iname_deps"].union(frozenset({vec_iname[0].name}))
-            insn_dep = sf.output.realize_direct(matprod, output_inames, out_shape, insn_args)
+                insn_args["forced_iname_deps"] = frozenset({vec_iname[0].name})
+            insn_dep = sf.interface.realize_direct(matprod, output_inames, out_shape, **insn_args)
         else:
             # Issue the reduction instruction that implements the multiplication
             # at the same time store the instruction ID for the next instruction to depend on
@@ -217,22 +298,10 @@ def _realize_sum_factorization_kernel(sf):
                                               )
                                   })
 
-    # Measure times and count operations in c++ code
-    if get_option("instrumentation_level") >= 4:
-        stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                           depends_on=frozenset({lp.match.Tagged(tag)}),
-                                           within_inames=frozenset(sf.within_inames))})
-        if sf.stage == 1:
-            qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
-            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-            dump_accumulate_timer(timer_name)
-            frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name),
-                                   depends_on=stop_insn)})
-
-    out = get_buffer_temporary(sf.buffer,
-                               shape=sf.output_shape,
-                               dim_tags=sf.output_dimtags,
-                               )
-    silenced_warning('read_no_write({})'.format(out))
-
-    return lp.TaggedVariable(out, sf.tag), insn_dep
+    # Construct a loopy kernel object
+    from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
+    args = ("const char* buffer0", "const char* buffer1") + sf.interface.signature_args
+    signature = "void {}({}) const".format(sf.function_name, ", ".join(args))
+    kernel = extract_kernel_from_cache("kernel_default", sf.function_name, [signature], add_timings=False)
+    delete_cache_items("kernel_default")
+    return kernel
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index db93c6cf0145007e13dc82bbb7ff817c0b54e060..fb283a0536318c1585a0963a33ca76105571cb84 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -1,6 +1,6 @@
 """ A pymbolic node representing a sum factorization kernel """
 
-from dune.perftool.options import get_option
+from dune.perftool.options import get_form_option, get_option
 from dune.perftool.generation import (get_counted_variable,
                                       subst_rule,
                                       transform,
@@ -8,9 +8,9 @@ from dune.perftool.generation import (get_counted_variable,
 from dune.perftool.pdelab.geometry import local_dimension, world_dimension
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray
-from dune.perftool.loopy.target import dtype_floatingpoint
+from dune.perftool.loopy.target import dtype_floatingpoint, type_floatingpoint
 from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad
-from dune.perftool.tools import get_leaf, maybe_wrap_subscript
+from dune.perftool.tools import get_leaf, maybe_wrap_subscript, remove_duplicates
 
 from pytools import ImmutableRecord, product
 
@@ -22,58 +22,85 @@ import frozendict
 import inspect
 
 
-class SumfactKernelInputBase(object):
+class SumfactKernelInterfaceBase(object):
+    """ A base class for the input/output of a sum factorization kernel
+    In stage 1, this represents the input object, in stage 3 the output object.
+    """
+    def realize(self, *a, **kw):
+        raise NotImplementedError
+
+    def realize_direct(self, *a, **kw):
+        raise NotImplementedError
+
     @property
-    def direct_input_is_possible(self):
-        return False
+    def within_inames(self):
+        return ()
 
-    def realize(self, sf, dep, index=0):
-        return dep
+    @property
+    def direct_is_possible(self):
+        return False
 
-    def realize_direct(self, inames):
+    @property
+    def stage(self):
         raise NotImplementedError
 
+    @property
+    def function_args(self):
+        return ()
+
+    @property
+    def signature_args(self):
+        return ()
+
+    @property
+    def function_name_suffix(self):
+        return ""
+
     def __repr__(self):
-        return "SumfactKernelInputBase()"
+        return "SumfactKernelInterfaceBase()"
 
 
-class VectorSumfactKernelInput(SumfactKernelInputBase):
-    def __init__(self, inputs):
-        assert(isinstance(inputs, tuple))
-        self.inputs = inputs
+class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
+    def __init__(self, interfaces):
+        assert(isinstance(interfaces, tuple))
+        self.interfaces = interfaces
 
     def __repr__(self):
-        return "_".join(repr(i) for i in self.inputs)
+        return "_".join(repr(i) for i in self.interfaces)
+
+    @property
+    def stage(self):
+        return 1
 
     @property
-    def direct_input_is_possible(self):
-        return all(i.direct_input_is_possible for i in self.inputs)
+    def direct_is_possible(self):
+        return all(i.direct_is_possible for i in self.interfaces)
 
     def realize(self, sf, dep):
-        for i, inp in enumerate(self.inputs):
+        for i, inp in enumerate(self.interfaces):
             dep = dep.union(inp.realize(sf, dep, index=i))
         return dep
 
     def realize_direct(self, shape, inames):
         # Check whether the input exhibits a favorable structure
         # (whether we can broadcast scalar values into SIMD registers)
-        total = set(self.inputs)
-        lower = set(self.inputs[:len(self.inputs) // 2])
-        upper = set(self.inputs[len(self.inputs) // 2:])
+        total = set(self.interfaces)
+        lower = set(self.interfaces[:len(self.interfaces) // 2])
+        upper = set(self.interfaces[len(self.interfaces) // 2:])
 
         if len(total) == 1:
             # All input coefficients use the exact same input coefficient.
             # We implement this by broadcasting it into a SIMD register
             return prim.Call(ExplicitVCLCast(dtype_floatingpoint()),
-                             (self.inputs[0].realize_direct(shape, inames),)
+                             (self.interfaces[0].realize_direct(shape, inames),)
                              )
         elif len(total) == 2 and len(lower) == 1 and len(upper) == 1:
             # The lower and the upper part of the SIMD register use
             # the same input coefficient, we combine the SIMD register
             # from two shorter SIMD types
             return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()),
-                             (self.inputs[0].realize_direct(shape, inames),
-                              self.inputs[len(self.inputs) // 2].realize_direct(shape, inames),
+                             (self.interfaces[0].realize_direct(shape, inames),
+                              self.interfaces[len(self.interfaces) // 2].realize_direct(shape, inames, which=1),
                               )
                              )
         else:
@@ -81,33 +108,41 @@ class VectorSumfactKernelInput(SumfactKernelInputBase):
             # need to load scalars into the SIMD vector.
             raise NotImplementedError("SIMD loads from scalars not implemented!")
 
+    @property
+    def function_args(self):
+        return sum((i.function_args for i in remove_duplicates(self.interfaces)), ())
 
-class SumfactKernelOutputBase(object):
     @property
-    def within_inames(self):
-        return ()
+    def signature_args(self):
+        if get_form_option("fastdg"):
+            return tuple("const {}* fastdg{}".format(type_floatingpoint(), i) for i, _ in enumerate(remove_duplicates(self.interfaces)))
+        else:
+            return ()
 
-    def realize(self, sf, result, insn_dep):
-        return dep
+    @property
+    def function_name_suffix(self):
+        return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
 
-    def realize_direct(self, result, inames, shape, args):
-        raise NotImplementedError
 
-    def __repr__(self):
-        return "SumfactKernelOutputBase()"
+class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
+    def __init__(self, interfaces):
+        self.interfaces = interfaces
 
+    def __repr__(self):
+        return "_".join(repr(o) for o in self.interfaces)
 
-class VectorSumfactKernelOutput(SumfactKernelOutputBase):
-    def __init__(self, outputs):
-        self.outputs = outputs
+    @property
+    def stage(self):
+        return 3
 
-    def __repr__(self):
-        return "_".join(repr(o) for o in self.outputs)
+    @property
+    def within_inames(self):
+        return self.interfaces[0].within_inames
 
     def _add_hadd(self, o, result):
         hadd_function = "horizontal_add"
-        if len(set(self.outputs)) > 1:
-            pos = self.outputs.index(o)
+        if len(set(self.interfaces)) > 1:
+            pos = self.interfaces.index(o)
             if pos == 0:
                 hadd_function = "horizontal_add_lower"
             else:
@@ -116,10 +151,10 @@ class VectorSumfactKernelOutput(SumfactKernelOutputBase):
         return prim.Call(prim.Variable(hadd_function), (result,))
 
     def realize(self, sf, result, insn_dep):
-        outputs = set(self.outputs)
+        outputs = set(self.interfaces)
 
-        trial_element, = set(o.trial_element for o in self.outputs)
-        trial_element_index, = set(o.trial_element_index for o in self.outputs)
+        trial_element, = set(o.trial_element for o in self.interfaces)
+        trial_element_index, = set(o.trial_element_index for o in self.interfaces)
         from dune.perftool.sumfact.accumulation import accum_iname
         element = get_leaf(trial_element, trial_element_index) if trial_element is not None else None
         inames = tuple(accum_iname(element, mat.rows, i)
@@ -134,8 +169,8 @@ class VectorSumfactKernelOutput(SumfactKernelOutputBase):
 
         return deps
 
-    def realize_direct(self, result, inames, shape, args):
-        outputs = set(self.outputs)
+    def realize_direct(self, result, inames, shape, **args):
+        outputs = set(self.interfaces)
 
         # If multiple horizontal_add's are to be performed with 'result'
         # we need to precompute the result!
@@ -143,15 +178,39 @@ class VectorSumfactKernelOutput(SumfactKernelOutputBase):
             substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
             subst_rule(substname, (), result)
             result = prim.Call(prim.Variable(substname), ())
-            transform(lp.precompute, substname, precompute_outer_inames=args["forced_iname_deps"])
+            transform(lp.precompute, substname)
 
         deps = frozenset()
         for o in outputs:
             hadd_result = self._add_hadd(o, result)
-            deps = deps.union(o.realize_direct(hadd_result, inames, shape, args))
+            which = tuple(remove_duplicates(self.interfaces)).index(o)
+            deps = deps.union(o.realize_direct(hadd_result, inames, shape, which=which, **args))
 
         return deps
 
+    @property
+    def function_args(self):
+        if get_form_option("fastdg"):
+            return sum((i.function_args for i in remove_duplicates(self.interfaces)), ())
+        else:
+            return()
+
+    @property
+    def signature_args(self):
+        if get_form_option("fastdg"):
+            def _get_pair(i):
+                ret = ("{}* fastdg{}".format(type_floatingpoint(), i),)
+                if self.within_inames:
+                    ret = ret + ("unsigned int jacobian_offset{}".format(i),)
+                return ret
+            return sum((_get_pair(i) for i, _ in enumerate(remove_duplicates(self.interfaces))), ())
+        else:
+            return ()
+
+    @property
+    def function_name_suffix(self):
+        return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
+
 
 class SumfactKernelBase(object):
     pass
@@ -161,11 +220,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def __init__(self,
                  matrix_sequence=None,
                  buffer=None,
-                 stage=1,
                  position_priority=None,
                  insn_dep=frozenset(),
-                 input=SumfactKernelInputBase(),
-                 output=SumfactKernelOutputBase(),
+                 interface=SumfactKernelInterfaceBase(),
                  predicates=frozenset(),
                  ):
         """Create a sum factorization kernel
@@ -219,31 +276,18 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             for intermediate results. The memory is expected to be
             pre-initialized with the input or you have to provide
             direct_input (FastDGGridOperator).
-        stage: 1 or 3
         position_priority: Will be used in the dry run to order kernels
             when doing vectorization e.g. (dx u,dy u,dz u, u).
-        restriction: Restriction for faces values.
         insn_dep: An instruction ID that the first issued instruction
             should depend upon. All following ones will depend on each
             other.
-        input: An SumfactKernelInputBase instance describing the input of the kernel
-        accumvar: The accumulation variable to accumulate into
-        trial_element: The leaf element of the trial function space.
-            Used to correctly nest stage 3 in the jacobian case.
-        test_element: The leaf element of the test function space
-            Used to compute offsets in the fastdg case.
-        test_element_index: the component of the test_element
-        trial_element_index: the component of the trial_element
+        interface: An SumfactKernelInterfaceBase instance describing the input
+            (stage 1) or output (stage 3) of the kernel
         """
         # Assert the inputs!
         assert isinstance(matrix_sequence, tuple)
         assert all(isinstance(m, BasisTabulationMatrixBase) for m in matrix_sequence)
-
-        assert stage in (1, 3)
-
-        assert isinstance(input, SumfactKernelInputBase)
-        assert isinstance(output, SumfactKernelOutputBase)
-
+        assert isinstance(interface, SumfactKernelInterfaceBase)
         assert isinstance(insn_dep, frozenset)
 
         # The following construction is a bit weird: Dict comprehensions do not have
@@ -269,7 +313,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def __str__(self):
         # Above stringifier just calls back into this
         return "SF{}:[{}]->[{}]".format(self.stage,
-                                        str(self.input),
+                                        str(self.interface),
                                         ", ".join(str(m) for m in self.matrix_sequence))
 
     mapper_method = "map_sumfact_kernel"
@@ -278,6 +322,11 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     # Some cache key definitions
     # Watch out for the documentation to see which key is used unter what circumstances
     #
+    @property
+    def function_name(self):
+        """ The name of the function that implements this kernel """
+        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence),
+                                    self.interface.function_name_suffix)
 
     @property
     def parallel_key(self):
@@ -304,11 +353,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         work on the same input coefficient (stage 1) or accumulate
         into the same thing (stage 3)
         """
-        return (repr(self.input), repr(self.output))
-
-    @property
-    def group_name(self):
-        return "sfgroup_{}_{}".format(self.input, self.output)
+        return repr(self.interface)
 
     #
     # Some convenience methods to extract information about the sum factorization kernel
@@ -318,7 +363,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         if self.parallel_key != other.parallel_key:
             return self.parallel_key < other.parallel_key
         if self.inout_key != other.inout_key:
-            return self.input_key < other.input_key
+            return self.inout_key < other.inout_key
         if self.position_priority == other.position_priority:
             return repr(self) < repr(other)
         if self.position_priority is None:
@@ -342,7 +387,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
 
     @property
     def within_inames(self):
-        return self.output.within_inames
+        return self.interface.within_inames
 
     def vec_index(self, sf):
         """ Map an unvectorized sumfact kernel object to its position
@@ -428,6 +473,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def tag(self):
         return "sumfac"
 
+    @property
+    def stage(self):
+        return self.interface.stage
+
     #
     # Define properties for conformity with the interface of VectorizedSumfactKernel
     #
@@ -529,7 +578,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def __str__(self):
         # Above stringifier just calls back into this
         return "VSF{}:[{}]->[{}]".format(self.stage,
-                                         ", ".join(str(k.input) for k in self.kernels),
+                                         ", ".join(str(k.interface) for k in self.kernels),
                                          ", ".join(str(mat) for mat in self.matrix_sequence))
 
     mapper_method = "map_vectorized_sumfact_kernel"
@@ -540,6 +589,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     # Some cache key definitions
     # Watch out for the documentation to see which key is used unter what circumstances
     #
+    @property
+    def function_name(self):
+        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence),
+                                    self.interface.function_name_suffix)
 
     @property
     def cache_key(self):
@@ -596,12 +649,15 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     #
 
     @property
-    def input(self):
-        return VectorSumfactKernelInput(tuple(k.input for k in self.kernels))
+    def stage(self):
+        return self.kernels[0].stage
 
     @property
-    def output(self):
-        return VectorSumfactKernelOutput(tuple(k.output for k in self.kernels))
+    def interface(self):
+        if self.stage == 1:
+            return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels))
+        else:
+            return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels))
 
     @property
     def cache_key(self):
@@ -611,10 +667,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def inout_key(self):
         return tuple(k.inout_key for k in self.kernels)
 
-    @property
-    def group_name(self):
-        return "_".join(k.group_name for k in self.kernels)
-
     @property
     def length(self):
         return self.kernels[0].length
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index daeabf9005cbff9c5917185b5711ceb6cb4342a3..5309fa832c7b805d4dc7e3d31213a2a70c0c2087 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -18,7 +18,6 @@ from dune.perftool.generation import (class_member,
                                       transform,
                                       valuearg
                                       )
-from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.loopy.target import dtype_floatingpoint
 from dune.perftool.loopy.vcl import ExplicitVCLCast, get_vcl_type_size
 from dune.perftool.pdelab.localoperator import (name_domain_field,
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index b29ebe221387a5af529f0dec29592b4de8dc762a..4a95dbced5e24d7efd0ddadc422511a1fc1362cc 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -93,3 +93,12 @@ def get_leaf(element, index):
         leaf_element = element.extract_component(index)[1]
 
     return leaf_element
+
+
+def remove_duplicates(iterable):
+    """ Remove duplicates from an iterable while preserving the order """
+    seen = set()
+    for i in iterable:
+        if i not in seen:
+            yield i
+            seen.add(i)