diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index c8eece5274d632fc764286bbcf1f346ba28092bd..3fa1a27d125e242ec0bd59bfc58c09b055f27818 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -86,6 +86,7 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
     sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in the sum factorization function using autotuning.")
+    sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum-factorization-specific performance transformations.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="Whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 393b21ff3c00e5765c525c10dfe4c7b41a56c59b..5cc2178ce0adc8083ceeda7224b4180e2e4186ba 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -7,6 +7,7 @@ import filelock
 import hashlib
 import json
 from operator import mul
+from six.moves import reduce
 import time
 
 import loopy as lp
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 71fd8c71e94b5ca7feffa59ec876af8149fa039c..61867987ff21a6348fe7f567a63c64836f7a81d2 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -385,12 +385,14 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def realize_direct_output(self, result, inames, shape, **args):
         outputs = set(self.interfaces)
 
+        # Introduce a substitution rule for the argument of the horizontal add
+        substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
+        subst_rule(substname, (), result)
+        result = prim.Call(prim.Variable(substname), ())
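+        # The lines above wrap 'result' in a zero-argument substitution rule,
+        # e.g. result = haddsubst_i_j(), so that lp.precompute can later
+        # replace it with a precomputed temporary where necessary.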
+
         # If multiple horizontal_add calls are to be performed with 'result'
         # we need to precompute the result!
         if len(outputs) > 1:
-            substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
-            subst_rule(substname, (), result)
-            result = prim.Call(prim.Variable(substname), ())
             transform(lp.precompute, substname)
 
         deps = frozenset()
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
index 41c0d690da9d8aa525871d77dc8672947795b5f8..04beaa1d765d567e172bf0162fccd121d10f6309 100644
--- a/python/dune/codegen/sumfact/transformations.py
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -1,4 +1,5 @@
 import re
+import logging
 
 import loopy as lp
 import pymbolic.primitives as prim
@@ -182,17 +183,17 @@ def _get_inames_of_reduction(instr, iname_permutation):
     inner_inames_base = []
     for index, within in enumerate(within_reduction):
         if within:
-            inner_inames_base.append('sf_out_inames_{}'.format(dim-1-index))
+            inner_inames_base.append('sf_out_inames_{}'.format(dim - 1 - index))
 
     # Name of inner inames and outer inames
     inner_inames = []
     outer_inames = []
     vec_inames = []
-    for i in [x.name for x in instr.assignee.index_tuple]:
+    for i in instr.within_inames:
         for j in inner_inames_base:
             if j in i:
                 inner_inames.append(i)
-        if 'vec' in i:
+        if 'sf_vec' in i:
             vec_inames.append(i)
         elif i not in inner_inames:
             outer_inames.append(i)
@@ -200,13 +201,12 @@ def _get_inames_of_reduction(instr, iname_permutation):
     # Reduction iname
     regex = re.compile('sf_red_([0-9]*)')
     reduction_index = set(regex.findall(str(instr)))
-    if isinstance(instr.expression, lp.symbolic.Reduction):
+    if len(reduction_index) == 0:
+        reduction_iname = None
+    else:
         assert len(reduction_index) == 1
         reduction_index = reduction_index.pop()
         reduction_iname = 'sf_red_{}'.format(reduction_index)
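+        # e.g. an instruction reducing over 'sf_red_2' yields 'sf_red_2' here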
-    else:
-        assert len(reduction_index) == 0
-        reduction_iname = None
 
     return outer_inames, reduction_iname, inner_inames, vec_inames
 
@@ -285,11 +285,11 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
     dim = world_dimension()
 
     # Nothing to do if permutation is identity
-    if iname_permutation == tuple(range(dim+1)):
+    if iname_permutation == tuple(range(dim + 1)):
         return kernel
 
     # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
-    default_iname_order = ['sf_out_inames_{}'.format(dim-1-i) for i in range(dim)] + ['sf_red']
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
     from dune.codegen.sumfact.permutation import permute_backward
     new_iname_order = permute_backward(default_iname_order, iname_permutation)
 
@@ -308,21 +308,21 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
         return current_iname_order
 
     for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+
         # We can directly use lp.prioritize_loops if:
         # - The reduction is the innermost loop
         # - There is no reduction (e.g. reduced direction on faces)
-        if iname_permutation[-1] == dim or not isinstance(instr.expression, lp.symbolic.Reduction):
-            # Inames used in this reduction
-            outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
-                                                                                               iname_permutation)
+        if iname_permutation[-1] == dim or reduction_iname is None:
             current_iname_order = _current_new_iname_order(outer_inames,
                                                            reduction_iname,
                                                            inner_inames,
                                                            new_iname_order)
             kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
             continue
-
-        assert isinstance(instr.expression.operation, lp.library.reduction.SumReductionOperation)
+        assert isinstance(instr.expression, lp.symbolic.Reduction)
 
         # Dependencies. Note: We change the instructions below so we cannot do
         # depends_on=instr.depends_on. But we know that the id of instr is
@@ -337,33 +337,36 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
         # Remove reduction
         kernel = lp.remove_instructions(kernel, set([instr.id]))
 
-        # Inames used in this reduction
-        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr, iname_permutation)
-
         # Create dim_tags
         dim_tags = ','.join(['f'] * len(inner_inames))
-        agg_variable = kernel.temporary_variables[instr.assignee.aggregate.name]
-        vectorized = isinstance(agg_variable.dim_tags[-1], lp.kernel.array.VectorArrayDimTag)
+        vectorized = len(vec_inames) > 0
         if vectorized:
             assert len(vec_inames) == 1
             dim_tags = dim_tags + ',vec'
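+        # e.g. two inner inames plus one vector lane yield dim_tags 'f,f,vec'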
 
         # Create shape
-        agg_inames = tuple(i.name for i in instr.assignee.index_tuple)
-        agg_shape = agg_variable.shape
-        shape = tuple(agg_shape[agg_inames.index(i)] for i in inner_inames)
-        if vectorized:
-            shape = shape + (agg_shape[-1],)
+        def _get_iname_bound(kernel, iname):
+            # Determine the loop length of an iname from the upper bound of
+            # its domain. TODO: Not sure whether this works in all cases.
+            ldi, = kernel.get_leaf_domain_indices((iname,))
+            domain = kernel.domains[ldi]
+            pwaff = domain.dim_max(0)
+            bound = lp.symbolic.pw_aff_to_expr(pwaff)
+            return bound + 1
+        shape = tuple(_get_iname_bound(kernel, i) for i in inner_inames + vec_inames)
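+        # Illustrative example: with one inner iname running over 0 <= i <= 3
+        # and a vector iname of length 4, shape becomes (4, 4).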
 
         # Update temporary_variables of this kernel
         from dune.codegen.loopy.temporary import DuneTemporaryVariable
         accum_variable = get_counted_variable('accum_variable')
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = lp.types.NumpyType(dtype_floatingpoint())
         var = {accum_variable: DuneTemporaryVariable(accum_variable,
-                                                     dtype=agg_variable.dtype,
+                                                     dtype=dtype,
                                                      shape=shape,
                                                      dim_tags=dim_tags,
                                                      managed=True)}
-        kernel.temporary_variables.update(var)
+        tv = kernel.temporary_variables.copy()
+        tv.update(var)
+        kernel = kernel.copy(temporary_variables=tv)
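+        # (loopy kernels are treated as immutable, hence copy + update +
+        # kernel.copy rather than mutating kernel.temporary_variables in place)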
 
         # Set accumulation variable to zero
         accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
@@ -378,7 +381,7 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
                                          depends_on=depends_on,
                                          tags=('accum_init',),
                                          )
-        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr,])
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
 
         # Accumulate in temporary variable
         assignee = prim.Subscript(prim.Variable(accum_variable), accum_init_inames)
@@ -389,26 +392,46 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
                                     expression,
                                     within_inames=within_inames,
                                     id=accum_id,
-                                    depends_on=frozenset([accum_init_id,]),
+                                    depends_on=frozenset([accum_init_id]),
                                     tags=('accum',),
                                     )
-        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr,])
-
-        # Assign accumulation result
-        assign_id = instr.id + '_assign'
-        assign_instr = lp.Assignment(instr.assignee,
-                                     assignee,
-                                     within_inames=within_inames,
-                                     id=assign_id,
-                                     depends_on=frozenset([accum_id,]),
-                                     tags=('accum_assign',),
-                                     )
-        kernel = kernel.copy(instructions=kernel.instructions + [assign_instr,])
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
+
+        if 'haddsubst' not in str(instr):
+            # Write back the result if this reduction does not come from a substitution
+            assign_id = instr.id + '_assign'
+            assign_instr = lp.Assignment(instr.assignee,
+                                         assignee,
+                                         within_inames=within_inames,
+                                         id=assign_id,
+                                         depends_on=frozenset([accum_id]),
+                                         tags=('accum_assign',),
+                                         )
+            kernel = kernel.copy(instructions=kernel.instructions + [assign_instr])
 
-        # Restore dependencies
-        for dep in depending:
-            match = lp.match.Id(dep)
-            kernel = lp.add_dependency(kernel, match, assign_id)
+            # Restore dependencies
+            for dep in depending:
+                match = lp.match.Id(dep)
+                kernel = lp.add_dependency(kernel, match, assign_id)
+        else:
+            # If this reduction comes from a substitution, take the
+            # corresponding assignment and replace the substitution with the
+            # accumulation result.
+            assert len(depending) == 1
+            assignment, = [i for i in kernel.instructions if depending[0] == i.id]
+            other = [i for i in kernel.instructions if i.id != assignment.id]
+            replace = {prim.Variable(instr.assignee.name): assignee}
+            from dune.codegen.loopy.symbolic import substitute
+            new_expression = substitute(assignment.expression, replace)
+            new_instr = assignment.copy(expression=new_expression,
+                                        depends_on=frozenset(list(assignment.depends_on) + [accum_id]))
+            kernel = kernel.copy(instructions=other + [new_instr])
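+            # e.g. an assignment that read the reduction temporary now reads
+            # the accumulation buffer directly and depends on the accumulation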
+
+            # Remove the temporary variable that belongs to the
+            # substitution. It is no longer used.
+            tv = kernel.temporary_variables.copy()
+            del tv[instr.assignee.name]
+            kernel = kernel.copy(temporary_variables=tv)
 
         # Reordering loops only works if we duplicate some inames
         duplicate_inames = tuple(i.name for i in accum_init_inames)
@@ -416,8 +439,13 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
             duplicate_inames = duplicate_inames[:-1]
         match = lp.match.Id(accum_init_id)
         kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
-        match = lp.match.Id(assign_id)
-        kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+        # Reorder loops of the assignment of the result
+        if 'haddsubst' not in str(instr):
+            match = lp.match.Id(assign_id)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+        else:
+            match = lp.match.Id(assignment.id)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
 
         # Change loop order
         current_iname_order = _current_new_iname_order(outer_inames,
@@ -478,37 +506,32 @@ def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variabl
 
     """
     if accum_variable:
-        return _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
+        kernel = _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
+        return kernel
     else:
-        # Need to adapts this!
+        # TODO: Need to adapt this!
         assert False
-        return _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
+        kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
+        return kernel
 
 
 def tensor_contraction_loop_order_generator(kernel):
     dim = world_dimension()
-    assert dim == 3
-
-    indices = ['l', 'k', 'i', 'j']
+    identity = range(dim + 1)
     import itertools
-    for loop_order in itertools.permutations(indices):
-        # palpo TODO: Heavy culling for 'quick' tests during development
-        if loop_order[0] != 'l' or loop_order[1] != 'k':
+    for permutation in itertools.permutations(identity):
+        # Cull obviously inefficient variants
+        if permutation[0] == dim:
             continue
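+        # Example (dim=3): identity is (0, 1, 2, 3); a permutation starting
+        # with dim (the reduction index) is skipped as obviously inefficient.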
 
-        order = ''.join(loop_order)
-        new_kernel = reorder_loops_in_tensor_contraction(kernel, order, True)
-        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(order),]
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, True)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(permutation)]
 
-        new_kernel = reorder_loops_in_tensor_contraction(kernel, loop_order, False)
-        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(order),]
+        # new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, False)
+        # yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(permutation),]
 
 
 def simple_autotuner(kernel_generator, signature):
-    # palpo TODO
-    from dune.codegen.options import set_option
-    set_option("autotune_google_benchmark", True)
-
     kernel, transformations = next(kernel_generator)
     best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
     best_kernel = kernel
@@ -521,23 +544,27 @@ def simple_autotuner(kernel_generator, signature):
 
 
 def autotune_tensor_contraction_loop_order(kernel, signature):
+    assert get_option('autotune_google_benchmark')
     from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add
     kernel = match_fused_multiply_add(kernel)
-
     generator = tensor_contraction_loop_order_generator(kernel)
     return simple_autotuner(generator, signature)
 
 
 def sumfact_performance_transformations(kernel, signature):
-    if kernel.name.startswith('sfimpl'):
-        # 2D
-        # kernel = reorder_loops_in_tensor_contraction(kernel, (2,0,1), True)
-
-        # 3D
-        # kernel = reorder_loops_in_tensor_contraction(kernel, (3,2,0,1), True)
-        # kernel = reorder_loops_in_tensor_contraction(kernel, (1,2,0,3), True)
-
-        # kernel = autotune_tensor_contraction_loop_order(kernel, signature)
-
-        pass
+    logger = logging.getLogger(__name__)
+
+    if get_form_option('sumfact_performance_transformations'):
+    logger.info('Applying sumfact performance transformations')
+        if kernel.name.startswith('sfimpl'):
+            # # TODO: hard-coded alternative permutations, kept for reference
+            # dim = world_dimension()
+            # if dim == 2:
+            #     kernel = reorder_loops_in_tensor_contraction(kernel, (2,0,1), True)
+            # else:
+            #     kernel = reorder_loops_in_tensor_contraction(kernel, (3,2,0,1), True)
+            #     # kernel = reorder_loops_in_tensor_contraction(kernel, (1,2,0,3), True)
+
+            kernel = autotune_tensor_contraction_loop_order(kernel, signature)
     return kernel
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 5bb10fbcd303a916ce1ab353d7ecac8972f5dbe4..929f06fe5df790dcd0bed7b3589ec9affd026fd1 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -134,3 +134,16 @@ if(benchmark_FOUND)
                                     INIFILE poisson_fastdg_volumes_3d_benchmark.mini
                                     )
 endif()
+
+
+#================================================
+# Poisson fastdg with performance transformations
+#================================================
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_fastdg_2d_performance_transformations
+                                  INIFILE poisson_fastdg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_fastdg_3d_performance_transformations
+                                  INIFILE poisson_fastdg_3d_performance_transformations.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..ee6b6080906326fd0a6fff28a28c17cc2937feb0
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
@@ -0,0 +1,27 @@
+__name = sumfact_poisson_fastdg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations = 1
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..f1e2a56f1a8e54160c0f9f75f28c37704d3d2bf8
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+# vectorization_strategy = explicit, none | expand gradvec
+vectorization_strategy = model, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations = 1
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2