diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 7455f1bb242fa042011f005e154e8088264ad47f..76a7f54f9d3516ee047913cb0ebd4947c189536b 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -84,6 +84,7 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact = CodegenOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
+    sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize the order of loops in the sum factorization function using autotuning.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index 09160ea5b0fcdcbad23621ff6a8d8c66267df666..8a22a1d8f45c79d90b9a6e9fed643bdc30787c59 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -164,7 +164,7 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         from dune.codegen.sumfact.basis import SumfactBasisMixin
         return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The result of stage 2 has the correct quadrature permutation but no
         # cost permutation is applied. The inames for this method are
         # quadrature and cost permuted. This means we need to reverse the cost
@@ -175,7 +175,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
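A minimal sketch of how the new form option would be switched on from Python, assuming the option helpers this patch itself uses further down (set_form_option/set_option); the loop order search additionally relies on the Google Benchmark based measurement path:

    from dune.codegen.options import set_form_option, set_option

    # Enable the autotuned loop order search for sum factorization kernels
    set_form_option("sumfact_optimize_loop_order", True)
    # The per-kernel benchmarks are generated via Google Benchmark
    set_option("autotune_google_benchmark", True)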
- inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index d19fd7be5587258e8db1f3ada3301b1e7017f95e..b4a48b49473c64b3b46136f7654c34fc53f43ca0 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -1,20 +1,22 @@ """ Autotuning for sum factorization kernels """ -from dune.codegen.generation import cache_restoring, delete_cache_items -from dune.codegen.loopy.target import DuneTarget -from dune.codegen.sumfact.realization import realize_sumfact_kernel_function -from dune.codegen.options import get_option, set_option -from dune.codegen.error import CodegenAutotuneError - -import loopy as lp -from pytools import product - import os import re import subprocess import filelock import hashlib +import json +from operator import mul +import loopy as lp +from pytools import product +from cgen import ArrayOf, AlignedAttribute, Initializer + +from dune.codegen.generation import cache_restoring, delete_cache_items +from dune.codegen.loopy.target import DuneTarget, type_floatingpoint +from dune.codegen.sumfact.realization import realize_sumfact_kernel_function +from dune.codegen.options import get_option, set_option +from dune.codegen.error import CodegenAutotuneError def get_cmake_cache_entry(entry): for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"): @@ -286,13 +288,139 @@ def generate_standalone_code(sf, filename): set_option("opcounter", opcounting) -def autotune_realization(sf): +def generate_standalone_kernel_code(kernel, signature, filename): + with open(filename, 'w') as f: + # Write headers + headers = ['#include "config.h"', + '#include <iostream>', + '#include <fstream>', + '#include <random>', + '#include "benchmark/benchmark.h"', + '#include <dune/codegen/common/vectorclass.hh>', + '#include <dune/codegen/sumfact/horizontaladd.hh>', + ] + f.write("\n".join(headers)) + + # Get a list of the function argument names + assert len(signature) == 1 + sig = signature[0] + arguments = sig[sig.find('(') +1:sig.find(')')].split(',') + arguments = [a.split(' ')[-1] for a in arguments] + + global_args = [a for a in kernel.args if a.name not in arguments] + + # Declare global arguments + f.write('\n\n') + target = DuneTarget() + for g in global_args: + decl_info = g.decl_info(target, True, g.dtype) + for idi in decl_info: + ast_builder = target.get_device_ast_builder() + arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) + arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) + arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl) + f.write('{}\n'.format(arg_decl)) + + # Generate function we want to benchmark + f.write('\n') + f.write(sig[0:sig.find(')')+1]) + f.writelines(lp.generate_body(kernel)) + f.write('\n\n') + + # Generate function that will do the benchmarking + f.write('static void BM_sumfact_kernel(benchmark::State& state){\n') + + # Declare random generators + real = type_floatingpoint() + lines = [' std::uniform_real_distribution<{}> unif(0,1);'.format(real), + ' std::uniform_int_distribution<int> unif_int(0,128);', + ' std::default_random_engine re;'] + f.write('\n'.join(lines) + '\n') + + # Declare function arguments + function_arguments = [a for a in kernel.args if a.name in arguments] + for arg in function_arguments: + if 'buffer' in arg.name: + byte_size = reduce(mul, arg.shape) * 8 + f.write(' char 
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index d3a2a190255ad2178fc44b5a691f49beb919b706..57287efeb4aaef4d0243799e3561731c98aa2272 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -348,14 +348,15 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Note: Here we do not need to reverse any permutation since this is
         # already done in the setup_input method above!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index 689c2b60c30c06b821972b250627f19bd608095c..acb3599b0ffce6d4f393d90d6249f3083328cb56 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -459,11 +459,12 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index 3a628576bf5a130fe359bfb78967018a5f8a5211..d020c490b89f224a8c1e43fdb83f8135d2c3eea5 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -60,6 +60,11 @@ def name_buffer_storage(buff, which):
     return name
 
 
+def _max_sum_factorization_buffer_size(sf):
+    size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
+               product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+    return size
+
+
 @kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
@@ -73,8 +78,7 @@ def _realize_sum_factorization_kernel(sf):
     for buf in buffers:
         # Determine the necessary size of the buffer. We assume that we do not
         # underintegrate the form!!!
-        size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
-                   product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+        size = _max_sum_factorization_buffer_size(sf)
         temporary_variable("{}_dummy".format(buf),
                            shape=(size,),
                            custom_base_storage=buf,
@@ -117,10 +121,29 @@ class BufferSwitcher(object):
     def __init__(self):
         self.current = 0
 
-    def get_temporary(self, name=None, **kwargs):
+    def get_temporary(self, sf=None, name=None, **kwargs):
         assert name
         bs = "buffer{}".format(self.current)
-        globalarg(bs)
+
+        # Calculate the correct alignment
+        shape = kwargs['shape']
+        assert shape
+        dim_tags = kwargs['dim_tags'].split(',')
+        assert dim_tags
+        vec_size = 1
+        if 'vec' in dim_tags:
+            vec_size = shape[dim_tags.index('vec')]
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = np.dtype(kwargs.get("dtype", dtype_floatingpoint()))
+        alignment = dtype.itemsize * vec_size
+
+        # Add this buffer as a global argument to the kernel. Attaching a shape
+        # makes benchmark generation for autotuning of loopy kernels easier.
+        # Since this buffer is used to store data of different sizes we need to
+        # make sure it is big enough.
+        assert sf
+        size = _max_sum_factorization_buffer_size(sf) // vec_size
+        globalarg(bs, shape=(size, vec_size), alignment=alignment, dim_tags=['f', 'vec'])
         temporary_variable(name,
                            managed=True,
                            custom_base_storage=bs,
@@ -192,7 +215,8 @@ def realize_sumfact_kernel_function(sf):
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             input_summand = sf.interface.realize_direct_input(input_inames, inp_shape)
         elif l == 0:
-            input_summand = sf.interface.realize_input(input_inames,
+            input_summand = sf.interface.realize_input(sf,
+                                                       input_inames,
                                                        inp_shape,
                                                        vec_iname,
                                                        vec_shape,
@@ -203,7 +227,8 @@ def realize_sumfact_kernel_function(sf):
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
-            inp = buffer.get_temporary("buff_step{}_in".format(l),
+            inp = buffer.get_temporary(sf,
+                                       "buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -249,7 +274,8 @@ def realize_sumfact_kernel_function(sf):
             output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation)
             output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation)
 
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -264,7 +290,8 @@ def realize_sumfact_kernel_function(sf):
         else:
             output_shape = tuple(out_shape[1:]) + (out_shape[0],)
 
-        out = buffer.get_temporary("buff_step{}_out".format(l),
+        out = buffer.get_temporary(sf,
+                                   "buff_step{}_out".format(l),
                                    shape=output_shape + vec_shape,
                                    dim_tags=ftags,
                                    )
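A small worked example of the buffer sizing and alignment logic above, with hypothetical sizes (three one-dimensional matrices, 4-lane vectorization, double precision):

    # Hypothetical sizes: quadrature sizes (3, 3, 3), basis sizes (2, 2, 2),
    # vector width 4, dtype double (itemsize 8)
    quad_entries = 3 * 3 * 3 * 4     # 108 entries for the quadrature-sized stage
    basis_entries = 2 * 2 * 2 * 4    # 32 entries for the basis-sized stage
    size = max(quad_entries, basis_entries)   # 108, the buffer must hold the larger stage
    vec_size = 4
    # globalarg then sees shape=(108 // 4, 4) = (27, 4) with dim_tags ['f', 'vec']
    # and alignment 8 * 4 = 32 bytes, i.e. one full SIMD vector per aligned row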
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 3fddff6297e42cf807fe1bb0017b9b3dce7aa668..8fc82f3bc2650ea78cb73375c5fb501ee3522ea4 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -60,7 +60,7 @@ class SumfactKernelInterfaceBase(object):
         """
         raise NotImplementedError
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         """Interpret the input of sumfact kernel function in the right way (non fastdgg)
 
         This happens inside the sumfact kernel function.
@@ -73,6 +73,7 @@ class SumfactKernelInterfaceBase(object):
 
         Parameters
         ----------
+        sf : SumfactKernel or VectorizedSumfactKernel
         inames : tuple of pymbolic.primitives.Variable
             Inames for accesing the input. Ordered according to permuted matrix sequence.
         shape : tuple of int
@@ -252,11 +253,12 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return prim.Call(prim.Variable(hadd_function), (result,))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The input for stage 3 is quadrature permuted. The inames and shape
         # passed to this method are quadrature and cost permuted. This means we
         # need to take the cost permutation back to get the right inames and
@@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         # Get a temporary that interprets the base storage of the input as a
         # column-major matrix.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
- inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) @@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): return prim.Call(prim.Variable(hadd_function), (result,)) - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # The input for stage 3 is quadrature permuted. The inames and shape # passed to this method are quadrature and cost permuted. This means we # need to take the cost permutation back to get the right inames and @@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): # Get a temporary that interprets the base storage of the input as a # column-major matrix. - inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py index 76cf241c90d6a85d6276b29d0bd670a63bedfd82..f059c4c567852da01f116eeeb158eb2a208b6f89 100644 --- a/python/dune/codegen/sumfact/transformations.py +++ b/python/dune/codegen/sumfact/transformations.py @@ -4,6 +4,7 @@ import islpy as isl from dune.codegen.generation import get_global_context_value from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions +from dune.codegen.options import get_form_option from dune.codegen.pdelab.geometry import world_dimension def move_zero_assignment_up(knl, move_up_inames): @@ -95,6 +96,8 @@ def reorder_loops_in_tensor_contraction(knl, iname_order): using einsum notation from numpy. iname_order should be a string like 'iklj' if the loops should be done in order i, k, l, j. + Without transformations those loops will be done in the order lkij. + In the sum factorization kernel itself those inames are called: sf_out_inames_2_* : l @@ -185,22 +188,63 @@ class LoopOrderTransformation(SumfactKernelFunctionTransformation): return 'looporder{}'.format(self.order) +def autotune_loop_order(sf): + # Baseline is the non transformed kernel + from dune.codegen.sumfact.autotune import autotune_realization + best_order = None + best_cost = autotune_realization(sf) + + # Go through all loop orderings. Note: Due to reduction removal there can + # be differences even in the case where the loop order here is the same as + # above. 
+    import itertools as it
+    loop_orders = [''.join(p) for p in it.permutations('lkij')]
+    for order in loop_orders:
+        from dune.codegen.sumfact.transformations import reorder_loops_in_tensor_contraction
+        trafo = LoopOrderTransformation(order)
+        transformed_sf = sf.copy(transformations=sf.transformations + (trafo,))
+        cost = autotune_realization(transformed_sf)
+        if cost < best_cost:
+            best_cost = cost
+            best_order = order
+
+    return best_order
+
+
 def attach_transformations(sf, vsf):
     if vsf == 0:
         return 0
 
     if get_global_context_value("dry_run") == None:
-        # Example and test of such a transformation:
-
-        # # Store transformation in sumfact kernel
-        # from dune.codegen.sumfact.transformations import LoopOrderTransformation
-        # trafo = LoopOrderTransformation('kjli')
-        # vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
-
-        # # Map the original kernel to the new one
-        # from dune.codegen.sumfact.vectorization import _cache_vectorization_info
-        # _cache_vectorization_info(sf, vsf)
+        from dune.codegen.options import set_form_option, set_option
+        # TODO
+        #
+        # set_form_option("sumfact_optimize_loop_order", True)
+        # set_option("autotune_google_benchmark", True)
+        if get_form_option("sumfact_optimize_loop_order"):
+            if get_global_context_value("integral_type") == 'cell':
+                # Find best loop order and store transformation in sumfact kernel
+                loop_order = autotune_loop_order(vsf)
+                print("Autotuned sum factorization loop order: {}".format(loop_order))
+                if loop_order is not None:
+                    trafo = LoopOrderTransformation(loop_order)
+                    vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
+
+                    # Map the original kernel to the transformed kernel
+                    from dune.codegen.sumfact.vectorization import _cache_vectorization_info
+                    _cache_vectorization_info(sf, vsf)
+
+        # TODO
+        #
+        # Example of such a transformation:
+        # if get_global_context_value("integral_type") == 'cell':
+        #     loop_order = 'lkji'
+        #     trafo = LoopOrderTransformation(loop_order)
+        #     vsf = vsf.copy(transformations=vsf.transformations + (trafo,))

+        #     # Map the original kernel to the transformed kernel
+        #     from dune.codegen.sumfact.vectorization import _cache_vectorization_info
+        #     _cache_vectorization_info(sf, vsf)
 
     return vsf
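For reference, a minimal sketch of the two ways a loop order can be applied by hand, following the docstring of reorder_loops_in_tensor_contraction and the commented example above; 'lkji' is only an illustrative ordering and knl/vsf are placeholders for a generated loopy kernel and a vectorized sum factorization kernel:

    # (a) Reorder the loops of an already generated loopy kernel directly
    knl = reorder_loops_in_tensor_contraction(knl, 'lkji')

    # (b) Attach the ordering as a kernel function transformation, as the
    #     autotuned code path above does
    trafo = LoopOrderTransformation('lkji')
    vsf = vsf.copy(transformations=vsf.transformations + (trafo,))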