diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py
index d383963318291ae947c70b1f083c320a60250303..22835c7cbce61336168396007e73703394eb7433 100755
--- a/bin/donkey_benchmark_execution_wrapper.py
+++ b/bin/donkey_benchmark_execution_wrapper.py
@@ -5,8 +5,16 @@ import sys
 import subprocess
 import os
 
+os.environ['DUNE_CODEGEN_THREADS'] = '20'
+
 # Run the actual command
-command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+
+# The actual measurements will be run like this (MPI parallelism)
+# command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+
+# Command for autotune benchmarks with threading
+command = "srun -p haswell10c -n 1 -c 20 --hint=nomultithread --cpu_bin=verbose,core".split()
+
 command.extend(sys.argv[1:])
 ret = subprocess.call(command)
 
diff --git a/python/dune/codegen/logging.conf b/python/dune/codegen/logging.conf
index 32974b12744dd1c23c36f561148943ec936b3ccb..1fcc4ba02962460ea3f1db8c678dbba5ef3fef2e 100644
--- a/python/dune/codegen/logging.conf
+++ b/python/dune/codegen/logging.conf
@@ -1,5 +1,5 @@
 [loggers]
-keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.vectorization
+keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.transformations,dune.codegen.sumfact.vectorization
 
 [handlers]
 keys=consoleHandler
@@ -16,6 +16,12 @@ handlers=consoleHandler
 qualname=dune.codegen.pdelab.localoperator
 propagate=0
 
+[logger_dune.codegen.sumfact.transformations]
+level=INFO
+handlers=consoleHandler
+qualname=dune.codegen.sumfact.transformations
+propagate=0
+
 [logger_dune.codegen.sumfact.vectorization]
 level=INFO
 handlers=consoleHandler
diff --git a/python/dune/codegen/loopy/transformations/performance.py b/python/dune/codegen/loopy/transformations/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..02644e80c3c0f782309a165c050949e0f1d71cb2
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/performance.py
@@ -0,0 +1,8 @@
+from dune.codegen.options import get_form_option
+from dune.codegen.sumfact.transformations import sumfact_performance_transformations
+
+
+def performance_transformations(kernel, signature):
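+    """Apply performance transformations to a generated loopy kernel
+
+    The signature is passed along so that standalone benchmarks can be
+    generated when autotuning kernel variants. Currently only sum
+    factorization kernels are transformed.
+    """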
+    if get_form_option("sumfact"):
+        kernel = sumfact_performance_transformations(kernel, signature)
+    return kernel
diff --git a/python/dune/codegen/loopy/transformations/remove_reductions.py b/python/dune/codegen/loopy/transformations/remove_reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ccca09c230f5ec32a9134035f3e883c63cc3680
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/remove_reductions.py
@@ -0,0 +1,71 @@
+import loopy as lp
+
+import pymbolic.primitives as prim
+
+
+def remove_reduction(knl, match):
+    """Removes all matching reductions and do direct accumulation in assignee instead"""
+
+    # Find reductions
+    for instr in lp.find_instructions(knl, match):
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            instructions = []
+
+            # Dependencies
+            depends_on = instr.depends_on
+            depending = []
+            for i in knl.instructions:
+                if instr.id in i.depends_on:
+                    depending.append(i.id)
+
+            # Remove the instruction from the kernel
+            knl = lp.remove_instructions(knl, set([instr.id]))
+
+            # Add instruction that sets assignee to zero
+            id_zero = instr.id + '_set_zero'
+            instructions.append(lp.Assignment(instr.assignee,
+                                              0,
+                                              within_inames=instr.within_inames,
+                                              id=id_zero,
+                                              tags=('set_zero',)
+                                              ))
+
+            # Add instruction that accumulates directly in assignee
+            assignee = instr.assignee
+            expression = prim.Sum((assignee, instr.expression.expr))
+            within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
+            id_accum = instr.id + '_accum'
+            instructions.append(lp.Assignment(assignee,
+                                              expression,
+                                              within_inames=within_inames,
+                                              id=id_accum,
+                                              depends_on=frozenset((id_zero,) + tuple(depends_on)),
+                                              tags=('assignment',)))
+
+            knl = knl.copy(instructions=knl.instructions + instructions)
+
+            # Restore dependencies
+            for dep in depending:
+                match = lp.match.Id(dep)
+                knl = lp.add_dependency(knl, match, id_accum)
+    return knl
+
+
+def remove_all_reductions(knl):
+    """Remove all reductions from loopy kernel
+
+    This removes all reductions by instead setting the assignee to zero and
+    directly accumulating in the assignee.
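+
+    As a schematic illustration: an assignment like
+    ``c[i] = reduce(sum, j, a[i, j]*b[j])`` is turned into
+    ``c[i] = 0`` (id ``<id>_set_zero``) followed by
+    ``c[i] = c[i] + a[i, j]*b[j]`` (id ``<id>_accum``), where the former
+    reduction iname j becomes an ordinary loop.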
+    """
+    # Find ids of all reductions
+    ids = []
+    for instr in knl.instructions:
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            ids.append(instr.id)
+
+    # Remove reductions
+    for id in ids:
+        match = lp.match.Id(id)
+        knl = remove_reduction(knl, match)
+
+    return knl
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 257da40169f578a4b64a367f6aaa4cd55729253d..a4f65ce476fe2f610135234fb8d5a16b97401dec 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -85,6 +85,9 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact = CodegenOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
+    sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in the sum factorization function using autotuning.")
+    sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.")
+    sumfact_performance_transformations_testrun = CodegenOption(default=0, helpstr="If larger than zero, determines the test case to run.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index eb7e1d3a50e7f8ba4b7b0f476c54565925c15d2a..dfc553fbefee7fa8c2e6aa1d92ae7ea047acdd38 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -594,6 +594,10 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     for trafo in transformations:
         kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
+    # Apply performance transformations
+    from dune.codegen.loopy.transformations.performance import performance_transformations
+    kernel = performance_transformations(kernel, signature)
+
     from dune.codegen.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index ae4a763063f0dc25303fc517356b58b02551eebc..c1ccd13a5552980db0be3788651065dc8a39654c 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -164,7 +164,7 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
             from dune.codegen.sumfact.basis import SumfactBasisMixin
             return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The result of stage 2 has the correct quadrature permutation but no
         # cost permutation is applied. The inames for this method are
         # quadrature and cost permuted. This means we need to reverse the cost
@@ -175,7 +175,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index a1d7442addc4e2d1d720b1cc81d8a914c1f75532..1e801f4550d17d89ef8972fe3ffa867d4a44ea25 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -1,21 +1,28 @@
 """ Autotuning for sum factorization kernels """
 
-from dune.codegen.generation import cache_restoring, delete_cache_items
-from dune.codegen.loopy.target import DuneTarget
-from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
-from dune.codegen.options import get_option, set_option
-from dune.codegen.error import CodegenAutotuneError
-
-import loopy as lp
-from pytools import product
-
 import os
 import re
 import subprocess
 import filelock
 import hashlib
+import logging
+import json
+from operator import mul
+import pkg_resources
+from six.moves import reduce
+import textwrap
 import time
 
+import loopy as lp
+from pytools import product
+from cgen import ArrayOf, AlignedAttribute, Initializer
+
+from dune.codegen.generation import cache_restoring, delete_cache_items
+from dune.codegen.loopy.target import DuneTarget, type_floatingpoint
+from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
+from dune.codegen.options import get_option, set_option
+from dune.codegen.error import CodegenAutotuneError
+
 
 def get_cmake_cache_entry(entry):
     for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
@@ -301,13 +308,207 @@ def generate_standalone_code(sf, filename):
     set_option("opcounter", opcounting)
 
 
-def autotune_realization(sf):
+def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
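+    """Generate a standalone benchmark source file for a single loopy kernel
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    signature: list of str
+        Signature of the kernel function (exactly one entry)
+    filename: str
+        Name of the benchmark source file to write
+    transformations: list of str, optional
+        Names of the applied transformations, included as a comment in the
+        generated file
+    """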
+    # Turn off opcounting
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
+    # Remove opcounter from signature
+    p = re.compile('OpCounter::OpCounter<([^>]*)>')
+    assert len(signature) == 1
+    sig = signature[0]
+    sig = p.sub(r'\1', sig)
+    assert 'OpCounter' not in sig
+
+    # Which transformations were applied
+    codegen_transformations = ''
+    if transformations:
+        for trafo in transformations:
+            codegen_transformations += '// {}\n'.format(trafo)
+
+    template = 'kernel_benchmark_template1.cc.in'
+    use_datasets = True
+
+    # Old benchmark template
+    # template = 'kernel_benchmark_template0.cc.in'
+    # use_datasets = False
+
+    template_filename = pkg_resources.resource_filename(__name__, template)
+    with open(template_filename, 'r') as f:
+        benchmark = f.read()
+
+    # Find function arguments and global arguments
+    arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
+    arguments = [a.split(' ')[-1] for a in arguments]
+    global_args = [a for a in kernel.args if a.name not in arguments]
+    buffer_arguments = [a for a in arguments if a.startswith('buff')]
+    input_arguments = [a for a in arguments if a not in buffer_arguments]
+
+    # Declare global arguments
+    codegen_declare_global_arguments = ''
+    target = DuneTarget()
+    for g in global_args:
+        decl_info = g.decl_info(target, True, g.dtype)
+        for idi in decl_info:
+            ast_builder = target.get_device_ast_builder()
+            arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
+            arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
+            arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
+            codegen_declare_global_arguments += '{}\n'.format(arg_decl)
+    codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, '  ')
+
+    # Helper function for argument initialization
+    def _initialize_arg(arg):
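+        """Return C++ source lines that fill arg with uniform random values"""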
+        if isinstance(arg, lp.ValueArg):
+            return []
+        real = type_floatingpoint()
+        size = reduce(mul, arg.shape)
+        fill_name = arg.name + '_fill'
+        lines = ['  {0}* {1} = ({0} *) {2};'.format(real, fill_name, arg.name),
+                 '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
+                 '    {}[i] = unif(re);'.format(fill_name),
+                 '  }']
+        return lines
+
+    # Initialize global arguments
+    codegen_initialize_global_arguments = ''
+    for arg in global_args:
+        lines = _initialize_arg(arg)
+        codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, '  ')
+
+    # Function we want to benchmark
+    codegen_benchmark_function = ''
+    codegen_benchmark_function += sig[0:sig.find(')') + 1]
+    codegen_benchmark_function += lp.generate_body(kernel)
+    codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, '  ')
+
+    # Declare function arguments
+    codegen_declare_arguments = []
+    codegen_declare_input = []
+    function_arguments = [a for a in kernel.args if a.name in arguments]
+    for arg in function_arguments:
+        if 'buffer' in arg.name:
+            byte_size = reduce(mul, arg.shape) * 8
+            codegen_declare_arguments.append('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
+                                                                                                      byte_size,
+                                                                                                      arg.alignment),)
+        elif isinstance(arg, lp.ValueArg):
+            assert 'jacobian_offset' in arg.name
+            decl = arg.get_arg_decl(ast_builder)
+            decl = Initializer(decl, 'unif_int(re)')
+            codegen_declare_arguments.append(('  {}\n'.format(decl)))
+        else:
+            assert 'fastdg' in arg.name
+            size = reduce(mul, arg.shape)
+            min_stride = min([tag.stride for tag in arg.dim_tags])
+            size *= min_stride
+            alignment = arg.dtype.itemsize
+            real = type_floatingpoint()
+            if use_datasets:
+                codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                             arg.name,
+                                                                                                             size,
+                                                                                                             alignment)))
+            else:
+                codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                   arg.name,
+                                                                                                   size,
+                                                                                                   alignment)))
+
+    codegen_declare_arguments = ''.join(codegen_declare_arguments)
+    codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, '  ')
+    codegen_declare_input = ''.join(codegen_declare_input)
+    codegen_declare_input = textwrap.indent(codegen_declare_input, '  ')
+
+    # Initialize function arguments
+    codegen_initialize_arguments = ''
+    codegen_initialize_input = ''
+    for arg in function_arguments:
+        if 'fastdg' in arg.name:
+            if use_datasets:
+                lines = _initialize_arg(arg)
+                lines = ['  ' + a for a in lines]
+                lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
+                lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
+                lines.append('}')
+                codegen_initialize_input += '\n'.join(lines) + '\n'
+            else:
+                lines = _initialize_arg(arg)
+                codegen_initialize_arguments += '\n'.join(lines) + '\n'
+        else:
+            lines = _initialize_arg(arg)
+            codegen_initialize_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, '  ')
+    codegen_initialize_input = textwrap.indent(codegen_initialize_input, '  ')
+
+    # Call the benchmark function
+    if use_datasets:
+        arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
+        codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
+        codegen_call_benchmark_function += '  ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
+        for arg in input_arguments:
+            codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
+        codegen_call_benchmark_function += '}'
+    else:
+        codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
+    codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, '    ')
+
+    # Replace placeholders in benchmark template
+    benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
+    benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
+    benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
+
+    # Write benchmark source file
+    with open(filename, 'w') as f:
+        f.writelines(benchmark)
+
+    # Maybe turn opcounting on again
+    set_option("opcounter", opcounting)
+
+
+def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
+    """Generate an microbenchmark, compile run and return time
+
+    Parameters
+    ----------
+    sf: SumfactKernel or VectorizedSumfactKernel
+    kernel: loopy.kernel.LoopKernel
+    signature: str
+    transformation: list of str
+        Will be used to distinguish between autotune targets
+    """
+    if sf is None:
+        assert kernel is not None
+        assert signature is not None
+    else:
+        assert kernel is None
+        assert signature is None
+
+    logger = logging.getLogger(__name__)
+
     # Make sure that the benchmark directory exists
     dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
     if not os.path.exists(dir):
         os.mkdir(dir)
 
-    basename = "autotune_sumfact_{}".format(sf.function_name)
+    if sf is None:
+        basename = "autotune_sumfact_{}".format(kernel.name)
+    else:
+        basename = "autotune_sumfact_{}".format(sf.function_name)
+    if transformations:
+        for trafo in transformations:
+            basename = '{}_{}'.format(basename, trafo)
     basename = hashlib.sha256(basename.encode()).hexdigest()
 
     filename = os.path.join(dir, "{}.cc".format(basename))
@@ -316,10 +517,16 @@ def autotune_realization(sf):
     executable = os.path.join(dir, basename)
 
     # Generate and compile a benchmark program
+    #
+    # Note: cache restoring is only necessary when generating from SumfactKernel
     with cache_restoring():
         with filelock.FileLock(lock):
             if not os.path.isfile(logname):
-                if get_option("autotune_google_benchmark"):
+                logger.debug('Generate autotune target in file {}'.format(filename))
+
+                if sf is None:
+                    generate_standalone_kernel_code(kernel, signature, filename, transformations)
+                elif get_option("autotune_google_benchmark"):
                     generate_standalone_code_google_benchmark(sf, filename)
                 else:
                     generate_standalone_code(sf, filename)
@@ -331,6 +538,7 @@ def autotune_realization(sf):
 
                 call.extend(compiler_invocation(executable, filename))
                 devnull = open(os.devnull, 'w')
+                os.environ['DUNE_CODEGEN_THREADS'] = '1'
                 ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
                     raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
@@ -349,6 +557,7 @@ def autotune_realization(sf):
                 call.append(executable)
                 if get_option("autotune_google_benchmark"):
                     call.append("--benchmark_out={}".format(logname))
+                    call.append("--benchmark_repetitions=5")
                     # call.append("--benchmark_out_format=csv")
                 else:
                     call.append(logname)
@@ -366,7 +575,15 @@ def autotune_realization(sf):
                 with open(logname) as json_file:
                     try:
                         data = json.load(json_file)
-                        return data['benchmarks'][0]['cpu_time']
+                        # Skip the aggregate entries (mean/median/stddev) that
+                        # --benchmark_repetitions adds and keep the fastest run
+                        minimal_time = 1e80
+                        for b in data['benchmarks']:
+                            if b['name'].endswith(('_mean', '_median', '_stddev')):
+                                continue
+                            minimal_time = min(minimal_time, b['cpu_time'])
+                        assert minimal_time < 1e80
+                        return minimal_time
                     except Exception as e:
                         print("Error while loading file {}".format(logname))
                         raise e
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index c9b75eb445af01e9420e850650b12054dd0115f3..757d51870aa7ccf690b9362b9b0522a4892f792b 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -345,14 +345,15 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Note: Here we do not need to reverse any permutation since this is
         # already done in the setup_input method above!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index d37558f5c40f42355b5fa99ed9a4ea25ab7ade28..8ac8aaa4faa1a4849c3b88fb54f87b0f87ca7b2b 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -456,11 +456,12 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c4b49da5aec59006a9f007534dc7711b98ef0387
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
@@ -0,0 +1,37 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+#include <iostream>
+#include <fstream>
+#include <random>
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+${CODEGEN_BENCHMARK_FUNCTION}
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  std::uniform_real_distribution<double> unif(0,1);
+  std::uniform_int_distribution<int> unif_int(0,1);
+  std::default_random_engine re;
+
+  ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+  ${CODEGEN_INITIALIZE_INPUT}
+
+  ${CODEGEN_DECLARE_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+  for (auto _ : state){
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+}
+
+${CODEGEN_BENCHMARK}
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c03577d672123b3063020841b731fc1fd9780fce
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
@@ -0,0 +1,58 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+
+#include <iostream>
+#include <fstream>
+#include <random>
+#include <cstdlib>
+
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
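+// Number of independent input datasets; each call of run_benchmark below
+// feeds every dataset to the kernel once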
+const int datasets = 512;
+
+class Benchmark{
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+
+public:
+  Benchmark (){
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+    ${CODEGEN_INITIALIZE_INPUT}
+  }
+
+  ${CODEGEN_BENCHMARK_FUNCTION}
+
+  void run_benchmark(){
+    ${CODEGEN_DECLARE_ARGUMENTS}
+
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+
+};
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  Benchmark benchmark;
+  for (auto _ : state){
+    benchmark.run_benchmark();
+  }
+}
+
+BENCHMARK(BM_sumfact_kernel) -> Threads(atoi(std::getenv("DUNE_CODEGEN_THREADS"))) -> UseRealTime();
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index 14cd0b1d0fe3814f294c22bc00f25260f4ee0233..461f5b0a3ace21e09a7a3707c4849aa1c36e6972 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -30,7 +30,6 @@ from dune.codegen.sumfact.quadrature import quadrature_points_per_direction
 from dune.codegen.sumfact.symbolic import (SumfactKernel,
                                            VectorizedSumfactKernel,
                                            )
-from dune.codegen.sumfact.vectorization import attach_vectorization_info
 from dune.codegen.sumfact.accumulation import sumfact_iname
 from dune.codegen.loopy.target import dtype_floatingpoint
 from dune.codegen.loopy.vcl import ExplicitVCLCast
@@ -61,6 +60,12 @@ def name_buffer_storage(buff, which):
     return name
 
 
+def _max_sum_factorization_buffer_size(sf):
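+    """Size (in scalars) of a buffer large enough for all stages of sf"""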
+    size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
+               product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+    return size
+
+
 @kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
@@ -74,8 +79,7 @@ def _realize_sum_factorization_kernel(sf):
     for buf in buffers:
         # Determine the necessary size of the buffer. We assume that we do not
         # underintegrate the form!!!
-        size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
-                   product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+        size = _max_sum_factorization_buffer_size(sf)
         temporary_variable("{}_dummy".format(buf),
                            shape=(size,),
                            custom_base_storage=buf,
@@ -118,10 +122,32 @@ class BufferSwitcher(object):
     def __init__(self):
         self.current = 0
 
-    def get_temporary(self, name=None, **kwargs):
+    def get_temporary(self, sf=None, name=None, **kwargs):
+        assert sf
         assert name
+
         bs = "buffer{}".format(self.current)
-        globalarg(bs)
+        shape = kwargs['shape']
+        assert shape
+        dim_tags = kwargs['dim_tags'].split(',')
+        assert dim_tags
+
+        # Calculate correct alignment
+        vec_size = 1
+        if 'vec' in dim_tags:
+            vec_size = shape[dim_tags.index('vec')]
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = np.dtype(kwargs.get("dtype", dtype_floatingpoint()))
+        alignment = dtype.itemsize * vec_size
+
+        # Add this buffer as a global argument to the kernel. Adding a shape
+        # makes benchmark generation for autotuning of loopy kernels easier.
+        # Since this buffer is used to store data of different sizes we need
+        # to make sure it is big enough.
+        size = _max_sum_factorization_buffer_size(sf)
+        globalarg(bs, shape=(size,), alignment=alignment, dim_tags=['f', ])
+
         temporary_variable(name,
                            managed=True,
                            custom_base_storage=bs,
@@ -193,7 +219,8 @@ def realize_sumfact_kernel_function(sf):
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             input_summand = sf.interface.realize_direct_input(input_inames, inp_shape)
         elif l == 0:
-            input_summand = sf.interface.realize_input(input_inames,
+            input_summand = sf.interface.realize_input(sf,
+                                                       input_inames,
                                                        inp_shape,
                                                        vec_iname,
                                                        vec_shape,
@@ -204,7 +231,8 @@ def realize_sumfact_kernel_function(sf):
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
-            inp = buffer.get_temporary("buff_step{}_in".format(l),
+            inp = buffer.get_temporary(sf,
+                                       "buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -250,7 +278,8 @@ def realize_sumfact_kernel_function(sf):
                 output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation)
                 output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation)
 
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -265,7 +294,8 @@ def realize_sumfact_kernel_function(sf):
 
         else:
             output_shape = tuple(out_shape[1:]) + (out_shape[0],)
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 60641b0ce4af4ddef525a4c03a438c4ebe277c0c..dfd9383f93c79d3c50bfcca24cdd717fcc16aa58 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -60,7 +60,7 @@ class SumfactKernelInterfaceBase(object):
         """
         raise NotImplementedError
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         """Interpret the input of sumfact kernel function in the right way (non fastdgg)
 
         This happens inside the sumfact kernel function.
@@ -73,6 +73,7 @@ class SumfactKernelInterfaceBase(object):
 
         Parameters
         ----------
+        sf : SumfactKernel or VectorizedSumfactKernel
         inames : tuple of pymbolic.primitives.Variable
             Inames for accessing the input. Ordered according to the permuted matrix sequence.
         shape : tuple of int
@@ -252,11 +253,12 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return prim.Call(prim.Variable(hadd_function), (result,))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The input for stage 3 is quadrature permuted. The inames and shape
         # passed to this method are quadrature and cost permuted. This means we
         # need to take the cost permutation back to get the right inames and
@@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         # Get a temporary that interprets the base storage of the input as a
         # column-major matrix.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -382,9 +385,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def realize_direct_output(self, result, inames, shape, **args):
         outputs = set(self.interfaces)
 
-        # If multiple horizontal_add's are to be performed with 'result'
-        # we need to precompute the result!
         if len(outputs) > 1:
+            # Introduce a substitution rule for the argument of the horizontal add
             substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
             subst_rule(substname, (), result)
             result = prim.Call(prim.Variable(substname), ())
@@ -503,6 +505,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         the transformation brings the next reduction index in the fastest
         position.
 
+        In numpy einsum notation this can be written (in 3D) as three
+        contractions of the form 'ij,jkl->kli'.
+
         It can make sense to permute the order of directions. If you have
         a small m_l (e.g. stage 1 on faces) it is better to do direction l
         first. This can be done by:
@@ -533,6 +538,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             other.
         interface: An SumfactKernelInterfaceBase instance describing the input
             (stage 1) or output (stage 3) of the kernel
+
         """
         # Assert the inputs!
         assert isinstance(matrix_sequence, tuple)
@@ -828,6 +834,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
                  vertical_width=1,
                  buffer=None,
                  insn_dep=frozenset(),
+                 transformations=(),
                  ):
         # Assert the input data structure
         assert isinstance(kernels, tuple)
@@ -880,8 +887,9 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     #
     @property
     def function_name(self):
-        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
+        name = "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
                                     self.interface.function_name_suffix)
+        return name
 
     @property
     def cache_key(self):
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
new file mode 100644
index 0000000000000000000000000000000000000000..a752d217add1013bbcb22771824cf992103b2237
--- /dev/null
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -0,0 +1,422 @@
+import re
+import logging
+
+import loopy as lp
+import pymbolic.primitives as prim
+import islpy as isl
+
+from dune.codegen.generation import (get_counted_variable,
+                                     get_global_context_value,
+                                     )
+from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions, remove_reduction
+from dune.codegen.options import get_form_option, get_option
+from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.error import CodegenAutotuneError
+from dune.codegen.sumfact.autotune import autotune_realization
+
+
+def _current_iname_order(current_inames, new_iname_order):
+    """Sort the inames for this contraction according to new_iname order"""
+    current_iname_order = []
+    for i in new_iname_order:
+        for j in current_inames:
+            if i in j:
+                current_iname_order.append(j)
+    return current_iname_order
+
+
+def _get_inames_of_reduction(instr, iname_permutation):
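+    """Split the inames of instr into outer, reduction, inner and vector inames
+
+    Which inames count as inner is determined by iname_permutation: every
+    loop nested inside the (permuted) reduction loop is inner. The returned
+    reduction iname is None if instr contains no reduction.
+    """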
+    # Check which loops are inside the reduction loop
+    reduction_index = iname_permutation[-1]
+    within_reduction = tuple(i > reduction_index for i in iname_permutation)
+
+    # Basename of inner inames
+    dim = world_dimension()
+    inner_inames_base = []
+    for index, within in enumerate(within_reduction):
+        if within:
+            inner_inames_base.append('sf_out_inames_{}'.format(dim - 1 - index))
+
+    # Name of inner inames and outer inames
+    inner_inames = []
+    outer_inames = []
+    vec_inames = []
+    for i in instr.within_inames:
+        for j in inner_inames_base:
+            if j in i:
+                inner_inames.append(i)
+        if 'sf_vec' in i:
+            vec_inames.append(i)
+        elif i not in inner_inames:
+            outer_inames.append(i)
+
+    # Reduction iname
+    regex = re.compile('sf_red_([0-9]*)')
+    reduction_index = set(regex.findall(str(instr)))
+    if len(reduction_index) == 0:
+        reduction_iname = None
+    else:
+        assert len(reduction_index) == 1
+        reduction_index = reduction_index.pop()
+        reduction_iname = 'sf_red_{}'.format(reduction_index)
+
+    return outer_inames, reduction_iname, inner_inames, vec_inames
+
+
+class FindReductionMapper(lp.symbolic.WalkMapper):
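+    """Walk an expression and collect all loopy Reduction nodes"""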
+    def __init__(self):
+        self.reductions = []
+
+    def map_reduction(self, expr, **kwargs):
+        self.reductions.append(expr)
+        return lp.symbolic.WalkMapper.map_reduction(self, expr, **kwargs)
+
+
+def _reorder_reduction_loops(kernel, match, iname_order):
+    """Reorder loops in instructions containing one reduction
+
+    In order to reorder the loops we need to manually do the accumulation in a
+    temporary that is large enough to fit all the data created in loops inside
+    the reduction loop.
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    match: loopy match expression
+        Matches the instructions whose loops should be reordered
+    iname_order: tuple of str
+        Preferred loop order for this instruction
+    """
+    instructions = lp.find_instructions(kernel, match)
+    for instr in instructions:
+        # Get reduction
+        reduction = FindReductionMapper()
+        reduction(instr.expression)
+        assert len(reduction.reductions) == 1
+        reduction = reduction.reductions[0]
+
+        # Reduction iname, inner inames and vector inames
+        assert len(reduction.inames) == 1
+        reduction_iname = reduction.inames[0]
+        assert set([reduction_iname]) == instr.reduction_inames()
+        if reduction_iname not in iname_order:
+            kernel = lp.prioritize_loops(kernel, iname_order)
+            continue
+        inner_inames = iname_order[iname_order.index(reduction_iname) + 1:]
+        if len(inner_inames) == 0:
+            kernel = lp.prioritize_loops(kernel, iname_order)
+            continue
+        # TODO: There should be a better way to do that
+        vec_inames = [i for i in instr.within_inames if 'sf_vec' in i]
+        assert len(vec_inames) < 2
+
+        # {{{ Create new temporary variable
+
+        # Create dim_tags
+        dim_tags = ','.join(['f'] * len(inner_inames))
+        vectorized = len(vec_inames) > 0
+        if vectorized:
+            assert len(vec_inames) == 1
+            dim_tags = dim_tags + ',vec'
+
+        # Create shape
+        shape = tuple(kernel.get_constant_iname_length(i) for i in inner_inames + vec_inames)
+
+        # Update temporary_variables of this kernel
+        from dune.codegen.loopy.temporary import DuneTemporaryVariable
+        accum_variable = get_counted_variable('accum_variable')
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = lp.types.NumpyType(dtype_floatingpoint())
+        var = {accum_variable: DuneTemporaryVariable(accum_variable,
+                                                     dtype=dtype,
+                                                     shape=shape,
+                                                     dim_tags=dim_tags,
+                                                     managed=True)}
+        tv = kernel.temporary_variables.copy()
+        tv.update(var)
+        kernel = kernel.copy(temporary_variables=tv)
+
+        # }}}
+
+        # Set accumulation variable to zero
+        accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
+        if vectorized:
+            accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        accum_init_id = instr.id + '_accum_init'
+        accum_init_instr = lp.Assignment(assignee,
+                                         0,
+                                         within_inames=instr.within_inames,
+                                         id=accum_init_id,
+                                         depends_on=instr.depends_on,
+                                         tags=('accum_init',),
+                                         )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
+
+        # Accumulate in temporary variable
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        expression = prim.Sum((assignee, reduction.expr))
+        within_inames = frozenset(tuple(instr.within_inames) + reduction.inames)
+        accum_id = instr.id + '_accum'
+        accum_instr = lp.Assignment(assignee,
+                                    expression,
+                                    within_inames=within_inames,
+                                    id=accum_id,
+                                    depends_on=frozenset([accum_init_id]),
+                                    tags=('accum',),
+                                    )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
+
+        # Replace reduction in insn with accumulation result
+        def map_reduction(expr, rec):
+            return assignee
+        mapper = lp.symbolic.ReductionCallbackMapper(map_reduction)
+        new_expression = mapper(instr.expression)
+        assign_id = instr.id + '_assign'
+        new_instr = instr.copy(expression=new_expression,
+                               id=assign_id,
+                               depends_on=frozenset(list(instr.depends_on) + [accum_id]))
+        kernel = kernel.copy(instructions=kernel.instructions + [new_instr])
+
+        # Fix dependencies and remove old instruction
+        for i in kernel.instructions:
+            if instr.id in i.depends_on:
+                match = lp.match.Id(i.id)
+                kernel = lp.add_dependency(kernel, match, assign_id)
+        kernel = lp.remove_instructions(kernel, set([instr.id]))
+
+        # Duplicate and reorder loops for accumulation
+        ids = [accum_init_id, accum_id]
+        duplicate_inames = tuple(inner_inames)
+        for idx in ids:
+            match = lp.match.Id(idx)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            current_iname_order = _current_iname_order(match_inames, iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+
+        # Reorder assignment loops
+        kernel = lp.prioritize_loops(kernel, iname_order)
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction accumulating directly in the data structure"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames,
+                                                   new_iname_order)
+
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            # If the instruction is a reduction we can directly accumulate in the assignee
+            assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
+            match = lp.match.Id(instr.id)
+            kernel = remove_reduction(kernel, match)
+
+            kernel = lp.prioritize_loops(kernel, current_iname_order)
+            duplicate_inames = tuple(inner_inames)
+            match = lp.match.Id(instr.id + '_set_zero')
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
+        else:
+            # In stage 3 this is usually not a reduction. In this case direct
+            # accumulation is not possible and we accumulate in a temporary
+            # variable instead
+            match = lp.match.Id(instr.id)
+            kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction using an accumulation variable"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames, new_iname_order)
+
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        match = lp.match.Id(instr.id)
+        kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variable=True):
+    """Change loop order in tensor contractions of sum factorization kernel
+
+    Since there is a reduction involved this implies more than just reordering
+    loops. There are two possibilities to handle the reduction:
+
+    1) Generate an object of correct size for the accumulation and assign to
+    the result data structure afterwards
+
+    2) Make sure to set the correct entries of the result data structure to
+    zero and accumulate in place
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    iname_permutation: tuple of ints
+        How to permute the loop inames. Should contain a permutation of the
+        numbers 0,...,dim (one entry per output loop plus the reduction loop)
+    accum_variable: bool
+        If True use method 1 else use method 2 from above
+
+    Notes
+    -----
+    Using einsum notation from numpy a contraction of a sum factorization
+    kernel can be written as:
+
+    - 3D: 'ij,jkl->kli'
+    - 2D: 'ij,jk->ki'
+
+    In the sum factorization kernel itself those inames are called:
+
+    sf_out_inames_2_* : l
+    sf_out_inames_1_* : k
+    sf_out_inames_0_* : i
+    sf_red_* : j
+
+    where * represents the current direction (0,1,2 for 3D problems). The
+    default order of the loops is
+
+    (sf_out_inames_2_*, sf_out_inames_1_*, sf_out_inames_0_*, sf_red_*)
+
+    A permutation vector of (3,2,0,1) would mean that we do
+
+    (sf_out_inames_0_*, sf_red_*, sf_out_inames_1_*, sf_out_inames_2_*)
+
+    instead.
+
+    """
+    if accum_variable:
+        kernel = _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
+        return kernel
+    else:
+        kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
+        return kernel
+
+
+def tensor_contraction_loop_order_generator(kernel):
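+    """Yield (kernel, transformations) pairs for all loop order candidates
+
+    The unmodified kernel is yielded first. Afterwards, every permutation of
+    the dim+1 loops that is not culled is generated twice: once with an
+    accumulation variable and once with direct accumulation.
+    """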
+    yield kernel, ['None']
+
+    dim = world_dimension()
+    identity = range(dim + 1)
+    import itertools
+    for permutation in itertools.permutations(identity):
+        # Cull variants that are not worth benchmarking
+        if permutation[0] == dim:
+            continue
+
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=True)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(permutation)]
+
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=False)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(permutation)]
+
+
+def simple_autotuner(kernel_generator, signature):
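+    """Benchmark all variants from the generator, return the fastest kernel
+    and its transformation list"""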
+    kernel, transformations = next(kernel_generator)
+    best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
+    best_kernel = kernel
+    best_transformations = transformations
+    for kernel, transformations in kernel_generator:
+        cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
+        if cost < best_cost:
+            best_cost = cost
+            best_kernel = kernel
+            best_transformations = transformations
+    return best_kernel, best_transformations
+
+
+def autotune_tensor_contraction_loop_order(kernel, signature):
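+    """Return the kernel with the fastest autotuned tensor contraction loop order
+
+    Fused multiply-add matching is applied before the loop order variants are
+    generated and benchmarked.
+    """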
+    logger = logging.getLogger(__name__)
+
+    assert get_option('autotune_google_benchmark')
+    from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add
+    kernel = match_fused_multiply_add(kernel)
+    generator = tensor_contraction_loop_order_generator(kernel)
+    kernel, transformations = simple_autotuner(generator, signature)
+    logger.debug('autotune_tensor_contraction_loop_order - kernel {} transformations {}'.format(kernel.name, transformations))
+    return kernel
+
+
+def sumfact_performance_transformations(kernel, signature):
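+    """Entry point for sum factorization specific performance transformations
+
+    Depending on the form options this either autotunes the loop order of all
+    sum factorization kernel functions (names starting with 'sfimpl') or
+    applies a fixed loop reordering selected by the testrun option.
+    """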
+    if get_form_option('sumfact_performance_transformations'):
+        assert not get_form_option('sumfact_performance_transformations_testrun')
+        if kernel.name.startswith('sfimpl'):
+            kernel = autotune_tensor_contraction_loop_order(kernel, signature)
+
+    # Testing performance transformations
+    elif get_form_option('sumfact_performance_transformations_testrun'):
+        assert not get_form_option('sumfact_performance_transformations')
+        if kernel.name.startswith('sfimpl'):
+            dim = world_dimension()
+            testcase = get_form_option('sumfact_performance_transformations_testrun')
+            if dim == 2:
+                testrun_dict = {
+                    1: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), True))],
+                    2: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), False))],
+                    3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), True))],
+                    4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), False))],
+                }
+                for trafo, arguments in testrun_dict[testcase]:
+                    kernel = trafo(kernel, *arguments)
+            else:
+                testrun_dict = {
+                    1: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), True))],
+                    2: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), False))],
+                    3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), True))],
+                    4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), False))],
+                }
+                for trafo, arguments in testrun_dict[testcase]:
+                    kernel = trafo(kernel, *arguments)
+    return kernel
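
Editor's note: for reference, a minimal numpy sketch (not part of the patch) of the 3D contraction 'ij,jkl->kli' from the docstring above, showing both accumulation strategies and the effect of the example permutation (3, 2, 0, 1); the array names A, X and out are invented for this illustration:

```python
import numpy as np

n = 3
A = np.random.rand(n, n)     # A[i, j]: the 1D sum factorization matrix
X = np.random.rand(n, n, n)  # X[j, k, l]: the input tensor
out = np.zeros((n, n, n))    # out[k, l, i]: the contraction result

# Default loop order (sf_out_inames_2, sf_out_inames_1, sf_out_inames_0,
# sf_red), i.e. (l, k, i, j): the reduction j is innermost, so method 1
# accumulates into a scalar variable.
for l in range(n):
    for k in range(n):
        for i in range(n):
            acc = 0.0
            for j in range(n):
                acc += A[i, j] * X[j, k, l]
            out[k, l, i] = acc
assert np.allclose(out, np.einsum('ij,jkl->kli', A, X))

# Permutation (3, 2, 0, 1) moves each loop p of the default order to position
# permutation[p], giving (i, j, k, l). The reduction is no longer innermost,
# so method 2 zero-initializes the assignee and accumulates in place.
out2 = np.zeros((n, n, n))
for i in range(n):
    for j in range(n):
        for k in range(n):
            for l in range(n):
                out2[k, l, i] += A[i, j] * X[j, k, l]
assert np.allclose(out2, np.einsum('ij,jkl->kli', A, X))
```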
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 5bb10fbcd303a916ce1ab353d7ecac8972f5dbe4..c81c94bc9fd536645149940d290846abce857715 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -134,3 +134,24 @@ if(benchmark_FOUND)
                                     INIFILE poisson_fastdg_volumes_3d_benchmark.mini
                                     )
 endif()
+
+
+#================================================
+# Poisson DG / FastDG with performance transformations
+#================================================
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_fastdg_2d_performance_transformations
+                                  INIFILE poisson_fastdg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_fastdg_3d_performance_transformations
+                                  INIFILE poisson_fastdg_3d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_dg_2d_performance_transformations
+                                  INIFILE poisson_dg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_dg_3d_performance_transformations
+                                  INIFILE poisson_dg_3d_performance_transformations.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..47b0e5269c9ebf62d03328e0f5370528a14ac9f6
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_dg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 0
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
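
Editor's note: as a reading aid (and assuming the usual dune-testtools metaini semantics, where `a, b | expand key` expands a mini file into one variant per value and keys sharing the same expand label vary together, tying `gradvec_suffix` to `vectorization_strategy` here), each of these mini files yields 2 x 4 = 8 executables. A hypothetical sketch of the resulting target names:

```python
from itertools import product

# Hypothetical reconstruction of the expanded __name values for the 2D DG case
basename = "sumfact_poisson_dg_2d_performance_transformations"
for gradvec, testrun in product(["gradvec", "nongradvec"], [1, 2, 3, 4]):
    print("{}_{}_testrun{}".format(basename, gradvec, testrun))
```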
diff --git a/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0b6d75f5c1181bcfe9f5c7ab9c6a941cd9653251
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
@@ -0,0 +1,29 @@
+__name = sumfact_poisson_dg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+# vectorization_strategy = explicit, none | expand gradvec
+vectorization_strategy = model, none | expand gradvec
+fastdg = 0
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
diff --git a/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..22461b9cc8bbafc6a9e15c2489bd35bf432bb5de
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = model, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0e9cfea80bab55167a277ce8e189761e9b780420
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2