diff --git a/python/dune/codegen/loopy/transformations/performance.py b/python/dune/codegen/loopy/transformations/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8995e32687078aa8272086f210b6da5e088e916
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/performance.py
@@ -0,0 +1,10 @@
+"""Dispatch backend-specific performance transformations onto generated loopy kernels."""
+from dune.codegen.options import get_form_option
+from dune.codegen.sumfact.transformations import sumfact_performance_transformations
+
+
+def performance_transformations(kernel, signature):
+    """Apply performance transformations to *kernel*; currently only for sum factorization."""
+    if get_form_option("sumfact"):
+        kernel = sumfact_performance_transformations(kernel, signature)
+    return kernel
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index 70ab9f4793a6163559d070e33e3600f9cd8c5578..3c06fa2bc0dcd14f68248a4c063dc0ba906b2568 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -593,6 +593,10 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     for trafo in transformations:
         kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
+    # Apply performance transformations
+    from dune.codegen.loopy.transformations.performance import performance_transformations
+    kernel = performance_transformations(kernel, signature)
+
     from dune.codegen.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index 8a22a1d8f45c79d90b9a6e9fed643bdc30787c59..c1ccd13a5552980db0be3788651065dc8a39654c 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -548,9 +548,6 @@ def generate_accumulation_instruction(expr, visitor):
     from dune.codegen.sumfact.vectorization import attach_vectorization_info
     vsf = attach_vectorization_info(sf)
 
-    from dune.codegen.sumfact.transformations import attach_transformations
-    vsf = attach_transformations(sf, vsf)
-
     # Make sure we have a buffer that we can set up the input with
     buffer = vsf.buffer
     if buffer is None:
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index 57287efeb4aaef4d0243799e3561731c98aa2272..757d51870aa7ccf690b9362b9b0522a4892f792b 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -220,9 +220,6 @@ class SumfactBasisMixin(GenericBasisMixin):
         from dune.codegen.sumfact.vectorization import attach_vectorization_info
         vsf = attach_vectorization_info(sf)
 
-        from dune.codegen.sumfact.transformations import attach_transformations
-        vsf = attach_transformations(sf, vsf)
-
         self.indices = None
 
         # If this sum factorization kernel was not used in the dry run we
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index acb3599b0ffce6d4f393d90d6249f3083328cb56..8ac8aaa4faa1a4849c3b88fb54f87b0f87ca7b2b 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -224,9 +224,6 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin):
         from dune.codegen.sumfact.vectorization import attach_vectorization_info
         vsf = attach_vectorization_info(sf)
 
-        from dune.codegen.sumfact.transformations import attach_transformations
-        vsf = attach_transformations(sf, vsf)
-
         # If this sum factorization kernel was not used in the dry run we
         # just return 0
         if vsf == 0:
@@ -545,9 +542,6 @@ def _name_jacobian(i, j, restriction, visitor):
     from dune.codegen.sumfact.vectorization import attach_vectorization_info
     vsf = attach_vectorization_info(sf)
 
-    from dune.codegen.sumfact.transformations import attach_transformations
-    vsf = attach_transformations(sf, vsf)
-
     # If this sum factorization kernel was not used in the dry run we
     # just return 0
     if vsf == 0:
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
index f059c4c567852da01f116eeeb158eb2a208b6f89..ffe5ca4181843233b9693ddda9bc117afae2d54f 100644
--- a/python/dune/codegen/sumfact/transformations.py
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -1,19 +1,22 @@
+import re
+
 import loopy as lp
 import pymbolic.primitives as prim
 import islpy as isl
 
 from dune.codegen.generation import get_global_context_value
 from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions
-from dune.codegen.options import get_form_option
+from dune.codegen.options import get_form_option, get_option
 from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.error import CodegenAutotuneError
 
-def move_zero_assignment_up(knl, move_up_inames):
+def move_zero_assignment_up(kernel, move_up_inames):
     if len(move_up_inames) == 0:
-        return knl
+        return kernel
 
     # Find the instruction we want to move around
     cond = lp.match.Tagged('set_zero')
-    instructions = lp.find_instructions(knl, cond)
+    instructions = lp.find_instructions(kernel, cond)
     move_iname_set = set(map(lambda x: prim.Variable(x), move_up_inames))
     instr = None
     for i in instructions:
@@ -25,12 +28,12 @@ def move_zero_assignment_up(knl, move_up_inames):
     assert (instr!=None)
 
     # Remove it
-    knl = lp.remove_instructions(knl, set([instr.id]))
+    kernel = lp.remove_instructions(kernel, set([instr.id]))
 
     # Create loop domains: In order to move it upwards we need to create
     # additional loops
     iname_appendix = '_move_up'
-    domains = knl.domains
+    domains = kernel.domains
     for iname in move_up_inames:
         # Find loop bound for this iname
         for dom in domains:
@@ -66,12 +69,12 @@ def move_zero_assignment_up(knl, move_up_inames):
     instructions = []
     instructions.append(instr.copy(assignee=assignee,
                                    within_inames=frozenset(within_inames)))
-    knl = knl.copy(instructions=knl.instructions + instructions,
+    kernel = kernel.copy(instructions=kernel.instructions + instructions,
                    domains=domains)
 
     # Add dependency to inner assignment instructions
     cond = lp.match.Tagged('assignment')
-    assignment_instructions = lp.find_instructions(knl, cond)
+    assignment_instructions = lp.find_instructions(kernel, cond)
     instr = None
     for i in assignment_instructions:
         instr_iname_set = set(i.assignee.index_tuple)
@@ -82,12 +85,12 @@ def move_zero_assignment_up(knl, move_up_inames):
 
     id_zero = instructions[0].id
     cond = lp.match.Id(instr.id)
-    knl = lp.add_dependency(knl, cond, id_zero)
+    kernel = lp.add_dependency(kernel, cond, id_zero)
 
-    return knl
+    return kernel
 
 
-def reorder_loops_in_tensor_contraction(knl, iname_order):
+def reorder_loops_in_tensor_contraction(kernel, iname_order):
     """Reorder the loop nest of the tensor contractions
 
     iname_order is a string that specifies the loop order. We use the following convention:
@@ -119,7 +122,7 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
     # needs a rework anyway so I just do the 3D case first.
     assert dim==3
 
-    knl = remove_all_reductions(knl)
+    kernel = remove_all_reductions(kernel)
 
     # TODO: Doc after rewrite
     reduction_iname = 'j'
@@ -132,7 +135,7 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
 
     # cond = lp.match.Tagged('set_zero')
     cond = lp.match.Tagged('assignment')
-    instructions = lp.find_instructions(knl, cond)
+    instructions = lp.find_instructions(kernel, cond)
     for instr in instructions:
         inames = tuple(map(lambda x: x.name, instr.assignee.index_tuple))
         current_move_up_inames = []
@@ -141,12 +144,11 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
                 if i.find(j) >= 0:
                     current_move_up_inames.append(i)
 
-        knl = move_zero_assignment_up(knl, current_move_up_inames)
+        kernel = move_zero_assignment_up(kernel, current_move_up_inames)
 
         # TODO: There should be a better method than searching the string for
         # 'sf_red'. Unfortunately there are sometimes Call instructions due to
         # broadcasts. That makes different ways difficult.
-        import re
         regex = re.compile('sf_red_([0-9]*)')
         reduction_index = set(regex.findall(str(instr)))
         assert len(reduction_index) == 1
@@ -162,90 +164,21 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
         for i in current_move_up_inames:
             prefered_iname_order.append(i)
         prefered_iname_order = tuple(prefered_iname_order)
-        knl = lp.prioritize_loops(knl, prefered_iname_order)
-
-    return knl
-
-
-class SumfactKernelFunctionTransformation(object):
-    def kernel_transformation(self):
-        """Transformation that will be applied to sumfact kernel function"""
-        raise NotImplementedError
-
-    def name_appendix(self):
-        """Name will be appended to name of sumfact kernel function"""
-        raise NotImplementedError
-
-
-class LoopOrderTransformation(SumfactKernelFunctionTransformation):
-    def __init__(self, order):
-        self.order = order
-
-    def kernel_transformation(self):
-        return (reorder_loops_in_tensor_contraction, self.order, {})
-
-    def name_appendix(self):
-        return 'looporder{}'.format(self.order)
-
-
-def autotune_loop_order(sf):
-    # Baseline is the non transformed kernel
-    from dune.codegen.sumfact.autotune import autotune_realization
-    best_order = None
-    best_cost = autotune_realization(sf)
-
-    # Go through all loop orderings. Note: Due to reduction removal there can
-    # be differences even in the case where the loop order here is the same as
-    # above.
-    import itertools as it
-    loop_orders = [''.join(p) for p in it.permutations('lkij')]
-    for order in loop_orders:
-        from dune.codegen.sumfact.transformations import reorder_loops_in_tensor_contraction
-        trafo = LoopOrderTransformation(order)
-        transformed_sf = sf.copy(transformations=sf.transformations + (trafo,))
-        cost = autotune_realization(transformed_sf)
-        if cost < best_cost:
-            best_cost = cost
-            best_order = order
-
-    return best_order
-
-def attach_transformations(sf, vsf):
-    if vsf == 0:
-        return 0
-
-    if get_global_context_value("dry_run") == None:
-
-        from dune.codegen.options import set_form_option, set_option
-        # TODO
-        #
-        # set_form_option("sumfact_optimize_loop_order", True)
-        # set_option("autotune_google_benchmark", True)
-        if get_form_option("sumfact_optimize_loop_order"):
-            if get_global_context_value("integral_type") == 'cell':
-                # Find best loop order and store transformation in sumfact kernel
-                loop_order = autotune_loop_order(vsf)
-                print("palpo loop_order: {}".format(loop_order))
-                if loop_order != None:
-                    trafo = LoopOrderTransformation(loop_order)
-                    vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
-
-                    # Map the original kernel to the transformed kernel
-                    from dune.codegen.sumfact.vectorization import _cache_vectorization_info
-                    _cache_vectorization_info(sf, vsf)
-
-        # TODO
-        #
-        # Exapmle for such an transformation
-        # if get_global_context_value("integral_type") == 'cell':
-        #     loop_order = 'lkji'
-        #     trafo = LoopOrderTransformation(loop_order)
-        #     vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
-
-        #     # Map the original kernel to the transformed kernel
-        #     from dune.codegen.sumfact.vectorization import _cache_vectorization_info
-        #     _cache_vectorization_info(sf, vsf)
-
-        return vsf
-
-    return sf
+        kernel = lp.prioritize_loops(kernel, prefered_iname_order)
+
+    return kernel
+
+
+def sumfact_performance_transformations(kernel, signature):
+    if kernel.name.startswith('sfimpl'):
+        # from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add
+        # kernel = match_fused_multiply_add(kernel)
+        # kernel = reorder_loops_in_tensor_contraction(kernel, 'ijlk')
+
+        # from dune.codegen.sumfact.autotune import autotune_realization
+        # test = autotune_realization(kernel=kernel, signature=signature)
+
+        # from pudb import set_trace; set_trace()
+
+        pass
+    return kernel