diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 97e7f1d540823d1753197272f0ccd0a415d8a8b2..e89f13c271ae6517baab92afbde305994c4c965f 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -303,7 +303,118 @@ def generate_standalone_code(sf, filename):
     set_option("opcounter", opcounting)
 
 
-def autotune_realization(sf):
+def generate_standalone_kernel_code(kernel, signature, filename):
+    with open(filename, 'w') as f:
+        # Write headers
+        headers = ['#include "config.h"',
+                   '#include <iostream>',
+                   '#include <fstream>',
+                   '#include <random>',
+                   '#include "benchmark/benchmark.h"',
+                   '#include <dune/codegen/common/vectorclass.hh>',
+                   '#include <dune/codegen/sumfact/horizontaladd.hh>',
+                   ]
+        f.write("\n".join(headers))
+
+        # Get a list of the function argument names
+        assert len(signature) == 1
+        sig = signature[0]
+        arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
+        arguments = [a.split(' ')[-1] for a in arguments]
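+        # This assumes every parameter is declared as "<type> <name>", so the
+        # last whitespace separated token is the argument name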
+
+        global_args = [a for a in kernel.args if a.name not in arguments]
+
+        # Declare global arguments
+        f.write('\n\n')
+        target = DuneTarget()
+        # Create the AST builder once; it is also needed below when declaring
+        # the scalar function arguments
+        ast_builder = target.get_device_ast_builder()
+        for g in global_args:
+            decl_info = g.decl_info(target, True, g.dtype)
+            for idi in decl_info:
+                arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
+                arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
+                arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
+                f.write('{}\n'.format(arg_decl))
+
+        # Generate function we want to benchmark
+        f.write('\n')
+        f.write(sig[0:sig.find(')')+1])
+        f.writelines(lp.generate_body(kernel))
+        f.write('\n\n')
+
+        # Generate function that will do the benchmarking
+        f.write('static void BM_sumfact_kernel(benchmark::State& state){\n')
+
+        # Declare random generators
+        real = type_floatingpoint()
+        lines = ['  std::uniform_real_distribution<{}> unif(0,1);'.format(real),
+                 '  std::uniform_int_distribution<int> unif_int(0,128);',
+                 '  std::default_random_engine re;']
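+        # Note: the default constructed engine is deterministically seeded, so
+        # repeated runs benchmark identical input data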
+        f.write('\n'.join(lines) + '\n')
+
+        # Declare function arguments
+        function_arguments = [a for a in kernel.args if a.name in arguments]
+        for arg in function_arguments:
+            if 'buffer' in arg.name:
+                # Buffer size in bytes, assuming at most 8 byte (double) entries
+                byte_size = reduce(mul, arg.shape) * 8
+                f.write('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
+                                                                                 byte_size,
+                                                                                 arg.alignment))
+            elif isinstance(arg, lp.ValueArg):
+                assert 'jacobian_offset' in arg.name
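+                # Scalar arguments (currently only the jacobian offsets) are
+                # initialized with a random integer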
+                decl = arg.get_arg_decl(ast_builder)
+                decl = Initializer(decl, 'unif_int(re)')
+                f.write('  {}\n'.format(decl))
+            else:
+                assert 'fastdg' in arg.name
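+                # Remaining arguments (fast DG data) are declared as plain
+                # aligned arrays of the scalar floating point type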
+                size = reduce(mul, arg.shape)
+                alignment = arg.dtype.itemsize
+                f.write('  {} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                               arg.name,
+                                                                               size,
+                                                                               alignment))
+
+        # Initialize arguments
+        def _initialize_arg(arg):
+            if isinstance(arg, lp.ValueArg):
+                return []
+            real = type_floatingpoint()
+            size = reduce(mul, arg.shape)
+            fill_name = arg.name + '_fill'
+            lines = ['  {0}* {1} = ({0} *) {2};'.format(real, fill_name, arg.name),
+                     '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
+                     '    {}[i] = unif(re);'.format(fill_name),
+                     '  }']
+            return lines
+
+        for arg in kernel.args:
+            lines = _initialize_arg(arg)
+            f.write('\n'.join(lines) + '\n')
+
+        # Benchmark loop
+        function_call = kernel.name + '({})'.format(','.join(arguments))
+        f.writelines(['  for (auto _ : state){\n',
+                      '    {};\n'.format(function_call),
+                      '  }\n',
+                      ])
+        f.write('}\n')
+
+        # Benchmark main
+        main = ['',
+                'BENCHMARK(BM_sumfact_kernel);',
+                '',
+                'BENCHMARK_MAIN();']
+        f.write('\n'.join(main))
+
+
+def autotune_realization(sf=None, kernel=None, signature=None):
+    # Benchmark either a SumfactKernel (sf) or an already generated loopy
+    # kernel together with its function signature, but never both
+    if sf is None:
+        assert kernel is not None and signature is not None
+    else:
+        assert kernel is None and signature is None
+
     # Make sure that the benchmark directory exists
     dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
     if not os.path.exists(dir):
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index d020c490b89f224a8c1e43fdb83f8135d2c3eea5..38171bdb05554b00e050bd0fc8899cf585c19e61 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -122,14 +122,16 @@ class BufferSwitcher(object):
         self.current = 0
 
     def get_temporary(self, sf=None, name=None, **kwargs):
+        assert sf
         assert name
-        bs = "buffer{}".format(self.current)
 
-        # Calculate correct alignment
+        bs = "buffer{}".format(self.current)
         shape = kwargs['shape']
         assert shape
         dim_tags = kwargs['dim_tags'].split(',')
         assert dim_tags
+
+        # Calculate correct alignment
         vec_size = 1
         if 'vec' in dim_tags:
             vec_size = shape[dim_tags.index('vec')]
@@ -142,8 +144,9 @@ class BufferSwitcher(object):
         # this buffer is used to store different data sizes we need to make
         # sure it is big enough.
         assert sf
-        size = _max_sum_factorization_buffer_size(sf) / vec_size
-        globalarg(bs, shape=(size, vec_size), alignment=alignment, dim_tags=['f', 'vec'])
+        size = _max_sum_factorization_buffer_size(sf)
+        globalarg(bs, shape=(size,), alignment=alignment, dim_tags=['f'])
+
         temporary_variable(name,
                            managed=True,
                            custom_base_storage=bs,
@@ -304,12 +307,6 @@ def realize_sumfact_kernel_function(sf):
                                               )
                                   })
 
-    # Register kernel transformations
-    for trafo in sf.transformations:
-        transform(trafo.kernel_transformation()[0],
-                  trafo.kernel_transformation()[1],
-                  **trafo.kernel_transformation()[2])
-
     # Construct a loopy kernel object
     from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
     args = ("const char* buffer0", "const char* buffer1") + sf.interface.signature_args
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 8fc82f3bc2650ea78cb73375c5fb501ee3522ea4..71fd8c71e94b5ca7feffa59ec876af8149fa039c 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -476,7 +476,6 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
                  insn_dep=frozenset(),
                  interface=SumfactKernelInterfaceBase(),
                  predicates=frozenset(),
-                 transformations=(),
                  ):
         """Create a sum factorization kernel
 
@@ -603,10 +602,6 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             name_quad_perm = "_qpperm_{}".format("".join(str(a) for a in self.interface.quadrature_permutation))
             name = name + name_quad_perm
 
-        # Change name for applied transformations
-        for t in self.transformations:
-            name = name + '_' + t.name_appendix()
-
         return name
 
     @property
@@ -867,7 +862,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
                                  buffer=buffer,
                                  insn_dep=insn_dep,
                                  vertical_width=vertical_width,
-                                 transformations=transformations,
                                  )
 
         prim.Variable.__init__(self, "VecSUMFAC")
@@ -896,11 +890,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def function_name(self):
         name = "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
                                     self.interface.function_name_suffix)
-
-        # Change name for applied transformations
-        for t in self.transformations:
-            name = name + '_' + t.name_appendix()
-
         return name
 
     @property