diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
index 8342a3435d879d53e1c633a2bb022c2d50d02fdb..41c0d690da9d8aa525871d77dc8672947795b5f8 100644
--- a/python/dune/codegen/sumfact/transformations.py
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -172,123 +172,317 @@ def _reorder_loops_in_tensor_contraction_direct(kernel, iname_order):
     return kernel
 
 
-def _reorder_loops_in_tensor_contraction_accum(kernel, iname_order):
+def _get_inames_of_reduction(instr, iname_permutation):
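+    """Sort the inames of a contraction instruction with respect to the reduction
+
+    Returns a tuple (outer_inames, reduction_iname, inner_inames, vec_inames)
+    where inner_inames are the assignee inames that should end up inside the
+    reduction loop according to iname_permutation. reduction_iname is None if
+    the instruction does not contain a reduction.
+    """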
+    # Check which loops are inside the reduction loop
+    reduction_index = iname_permutation[-1]
+    within_reduction = tuple(i > reduction_index for i in iname_permutation)
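+    # Example: iname_permutation=(3,2,0,1) gives reduction_index=1 and
+    # within_reduction=(True, True, False, False)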
+
+    # Basename of inner inames
     dim = world_dimension()
-    assert dim == 3
+    inner_inames_base = []
+    for index, within in enumerate(within_reduction):
+        if within:
+            inner_inames_base.append('sf_out_inames_{}'.format(dim-1-index))
+
+    # Sort the actual inames (with direction index) into inner, outer and vector inames
+    inner_inames = []
+    outer_inames = []
+    vec_inames = []
+    for i in [x.name for x in instr.assignee.index_tuple]:
+        for j in inner_inames_base:
+            if j in i:
+                inner_inames.append(i)
+        if 'vec' in i:
+            vec_inames.append(i)
+        elif i not in inner_inames:
+            outer_inames.append(i)
+
+    # Reduction iname
+    regex = re.compile('sf_red_([0-9]*)')
+    reduction_index = set(regex.findall(str(instr)))
+    if isinstance(instr.expression, lp.symbolic.Reduction):
+        assert len(reduction_index) == 1
+        reduction_index = reduction_index.pop()
+        reduction_iname = 'sf_red_{}'.format(reduction_index)
+    else:
+        assert len(reduction_index) == 0
+        reduction_iname = None
 
-    if iname_order.endswith('j'):
-        return kernel
+    return outer_inames, reduction_iname, inner_inames, vec_inames
 
-    # kernel = remove_all_reductions(kernel)
-    kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_order)
 
-    cond = lp.match.Tagged('set_zero')
-    for instr in lp.find_instructions(kernel, cond):
-        assert len(instr.depends_on) == 0
+def _duplicate_assignment_inames(kernel, match):
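+    """Duplicate the inames of all assignments matching match
+
+    Each matched assignment is removed and re-added with duplicated inames
+    ('_duplicate' suffix) and matching new domains so that it can be scheduled
+    as a separate loop nest. Dependencies of and on the instruction are
+    preserved.
+    """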
+    instructions = lp.find_instructions(kernel, match)
+    for instr in instructions:
+        assert isinstance(instr, lp.kernel.instruction.Assignment)
 
-        # Depending on this instruction
+        # Dependencies
+        depends_on = instr.depends_on
         depending = []
         for i in kernel.instructions:
             if instr.id in i.depends_on:
                 depending.append(i.id)
-        assert len(depending) == 1
 
-        active_inames = [i.name.endswith('move_up') for i in instr.assignee.index]
-        from loopy.kernel.array import VectorArrayDimTag
-        agg = kernel.temporary_variables[instr.assignee.aggregate.name]
-        if isinstance(agg.dim_tags[-1], VectorArrayDimTag):
-            active_inames[-1] = True
-
-        # Instead of setting the variable itself to zero we set an accumulation
-        # variable to zero
+        # Remove instruction
         kernel = lp.remove_instructions(kernel, set([instr.id]))
-        accum_variable = get_counted_variable('accum_variable')
-        accum_init_inames = tuple(i for (i,j) in zip(instr.assignee.index, active_inames) if j)
-        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
-        accum_init_id = instr.id + '_accum_init'
-        accum_init_instr = lp.Assignment(assignee,
-                                         0,
-                                         within_inames=instr.within_inames,
-                                         id=accum_init_id,
-                                         tags=('accum_variable',)
-                                         )
-        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr,])
+
+        def _duplicate_name(iname):
+            iname_appendix = '_duplicate'
+            return iname.name + iname_appendix
+
+        agg_variable = kernel.temporary_variables[instr.assignee.aggregate.name]
+        vectorized = isinstance(agg_variable.dim_tags[-1], lp.kernel.array.VectorArrayDimTag)
+        if vectorized:
+            inames = instr.assignee.index_tuple[:-1]
+        else:
+            inames = instr.assignee.index_tuple
+
+        # Create new domains
+        domains = kernel.domains
+        new_domains = []
+        for iname in inames:
+            # Find the loop bound from the corresponding domain
+            for dom in domains:
+                if iname.name in dom.get_var_names(isl.dim_type.set):
+                    # TODO: There must be a better way to get this information using isl
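+                    # The loop bound is parsed from isl's string
+                    # representation, e.g. "{ [i] : 0 <= i <= 6 }"
+                    # (illustrative). Try "<iname> = " first (fixed value);
+                    # find() returns -1 if that is not present (checked
+                    # below) and we fall back to "<iname> <= ".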
+                    str_begin = str(dom).find(iname.name + ' =') + len(iname.name) + 3
+                    if str_begin == len(iname.name) + 3 - 1:
+                        str_begin = str(dom).find(iname.name + ' <=') + len(iname.name) + 4
+                    str_end = str_begin + str(dom)[str_begin:].find(' ')
+                    loop_bound = int(str(dom)[str_begin:str_end]) + 1
+                    break
+
+            # Create new domain
+            domain = "{{ [{0}] : 0<={0}<{1} }}".format(_duplicate_name(iname), loop_bound)
+            domain = lp.kernel.creation.parse_domains(domain, {})
+            new_domains.append(domain)
+        for domain in new_domains:
+            domains = domains + domain
+
+        # Create new inames
+        new_inames = tuple(prim.Variable(_duplicate_name(i)) for i in inames)
+        if vectorized:
+            new_inames = new_inames + (instr.assignee.index_tuple[-1],)
+
+        # Create new instruction within the new inames
+        assignee = prim.Subscript(instr.assignee.aggregate, new_inames)
+        new_instruction = instr.copy(assignee=assignee,
+                                     depends_on=depends_on,
+                                     within_inames=frozenset([i.name for i in new_inames]))
+        kernel = kernel.copy(instructions=kernel.instructions + [new_instruction],
+                             domains=domains)
 
         # Restore dependencies
         for dep in depending:
             match = lp.match.Id(dep)
-            kernel = lp.add_dependency(kernel, match, accum_init_id)
+            kernel = lp.add_dependency(kernel, match, new_instruction.id)
+
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
+    dim = world_dimension()
+
+    # Nothing to do if the permutation is the identity (e.g. (0, 1, 2, 3) in 3D)
+    if iname_permutation == tuple(range(dim+1)):
+        return kernel
 
-        # Make accumulation variable a temporary_variable of this kernel
-        #
-        # Create dim tags for accum variable
-        dim_tags = ','.join(['f'] * sum(active_inames))
-        if isinstance(agg.dim_tags[-1], VectorArrayDimTag):
-            dim_tags = ','.join(['f'] * (sum(active_inames)-1)) + ",vec"
+    # Iname basenames used in the sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim-1-i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
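+    # For example, dim=3 and iname_permutation=(3,2,0,1) yield the order
+    # ['sf_out_inames_0', 'sf_red', 'sf_out_inames_1', 'sf_out_inames_2']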
+
+    # Get the real names with direction indices in the right order
+    def _current_new_iname_order(outer, reduction, inner, new_iname_order):
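+        # Map the permuted iname basenames to the direction-suffixed inames
+        # of this instruction, e.g. (illustrative) 'sf_out_inames_0' ->
+        # 'sf_out_inames_0_1' for direction 1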
+        if reduction:
+            reduction = [reduction]
+        else:
+            reduction = []
+        all_inames = outer + reduction + inner
+        current_iname_order = []
+        for i in new_iname_order:
+            for j in all_inames:
+                if i in j:
+                    current_iname_order.append(j)
+        return current_iname_order
+
+    for instr in kernel.instructions:
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. the reduced direction on faces)
+        if iname_permutation[-1] == dim or not isinstance(instr.expression, lp.symbolic.Reduction):
+            # Inames used in this reduction
+            outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                               iname_permutation)
+            current_iname_order = _current_new_iname_order(outer_inames,
+                                                           reduction_iname,
+                                                           inner_inames,
+                                                           new_iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
 
-        # Create shape for accum variable
-        shape = tuple(i for (i,j) in zip(agg.shape, active_inames) if j)
+        assert isinstance(instr.expression.operation, lp.library.reduction.SumReductionOperation)
 
+        # Dependencies. Note: We modify the instructions below, so we cannot
+        # simply use depends_on=instr.depends_on. But the id of instr is still
+        # valid, so we can look up the current dependencies through it.
+        match = lp.match.Id(instr.id)
+        depends_on = lp.find_instructions(kernel, match)[0].depends_on
+        depending = []
+        for i in kernel.instructions:
+            if instr.id in i.depends_on:
+                depending.append(i.id)
+
+        # Remove reduction
+        kernel = lp.remove_instructions(kernel, set([instr.id]))
+
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr, iname_permutation)
+
+        # Create dim_tags
+        dim_tags = ','.join(['f'] * len(inner_inames))
+        agg_variable = kernel.temporary_variables[instr.assignee.aggregate.name]
+        vectorized = isinstance(agg_variable.dim_tags[-1], lp.kernel.array.VectorArrayDimTag)
+        if vectorized:
+            assert len(vec_inames) == 1
+            dim_tags = dim_tags + ',vec'
+
+        # Create shape
+        agg_inames = tuple(i.name for i in instr.assignee.index_tuple)
+        agg_shape = agg_variable.shape
+        shape = tuple(agg_shape[agg_inames.index(i)] for i in inner_inames)
+        if vectorized:
+            shape = shape + (agg_shape[-1],)
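+        # e.g. (illustrative) two inner inames of extent 7 and a vectorized
+        # layout give dim_tags 'f,f,vec' and shape (7, 7, agg_shape[-1])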
+
+        # Update temporary_variables of this kernel
         from dune.codegen.loopy.temporary import DuneTemporaryVariable
+        accum_variable = get_counted_variable('accum_variable')
         var = {accum_variable: DuneTemporaryVariable(accum_variable,
-                                                     dtype=agg.dtype,
+                                                     dtype=agg_variable.dtype,
                                                      shape=shape,
                                                      dim_tags=dim_tags,
                                                      managed=True)}
         kernel.temporary_variables.update(var)
 
-        # Accumulate in accumulate variable
-        #
-        # Find accumulation instruction
-        accum_instr = lp.find_instructions(kernel, lp.match.Id(depending[0]))[0]
-        assert accum_instr.assignee == accum_instr.expression.children[0]
-
-        # Dependencies
-        depends_on = accum_instr.depends_on
-        depending = []
-        for i in kernel.instructions:
-            if accum_instr.id in i.depends_on:
-                depending.append(i.id)
+        # Set accumulation variable to zero
+        accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
+        if vectorized:
+            accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
+        assignee = prim.Subscript(prim.Variable(accum_variable), accum_init_inames)
+        accum_init_id = instr.id + '_accum_init'
+        accum_init_instr = lp.Assignment(assignee,
+                                         0,
+                                         within_inames=instr.within_inames,
+                                         id=accum_init_id,
+                                         depends_on=depends_on,
+                                         tags=('accum_init',),
+                                         )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr,])
 
-        # Replace with accumulation in accum_variable
-        kernel = lp.remove_instructions(kernel, set([accum_instr.id]))
-        accum_inames = tuple(i for (i,j) in zip(accum_instr.assignee.index, active_inames) if j)
-        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_inames)
-        expression = prim.Sum((assignee, accum_instr.expression.children[1]))
-        accum_id = accum_instr.id + '_accumvar'
-        new_accum_instr = lp.Assignment(assignee,
-                                        expression,
-                                        within_inames=accum_instr.within_inames,
-                                        id=accum_id,
-                                        depends_on=depends_on,
-                                        )
-        kernel = kernel.copy(instructions=kernel.instructions + [new_accum_instr,])
+        # Accumulate in temporary variable
+        assignee = prim.Subscript(prim.Variable(accum_variable), accum_init_inames)
+        expression = prim.Sum((assignee, instr.expression.expr))
+        within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
+        accum_id = instr.id + '_accum'
+        accum_instr = lp.Assignment(assignee,
+                                    expression,
+                                    within_inames=within_inames,
+                                    id=accum_id,
+                                    depends_on=frozenset([accum_init_id,]),
+                                    tags=('accum',),
+                                    )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr,])
 
         # Assign accumulation result
-        #
-        # The reduction is already done
-        within_inames = frozenset(i for i in accum_instr.within_inames if 'red' not in i)
-        assign_id = accum_instr.id + '_assign'
-        assign_instr = lp.Assignment(accum_instr.assignee,
+        # The reduction loop is already finished at this point, so the
+        # assignment must not be nested inside the reduction iname again
+        within_inames = instr.within_inames
+        assign_id = instr.id + '_assign'
+        assign_instr = lp.Assignment(instr.assignee,
                                      assignee,
                                      within_inames=within_inames,
                                      id=assign_id,
                                      depends_on=frozenset([accum_id,]),
+                                     tags=('accum_assign',),
                                      )
         kernel = kernel.copy(instructions=kernel.instructions + [assign_instr,])
 
+        # Restore dependencies
         for dep in depending:
             match = lp.match.Id(dep)
             kernel = lp.add_dependency(kernel, match, assign_id)
 
+        # Reordering the loops only works if the init and assign instructions
+        # run on duplicated inames; otherwise the requested loop priorities
+        # would be contradictory
+        duplicate_inames = tuple(i.name for i in accum_init_inames)
+        if vectorized:
+            duplicate_inames = duplicate_inames[:-1]
+        match = lp.match.Id(accum_init_id)
+        kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+        match = lp.match.Id(assign_id)
+        kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+
+        # Change loop order
+        current_iname_order = _current_new_iname_order(outer_inames,
+                                                       reduction_iname,
+                                                       inner_inames,
+                                                       new_iname_order)
+        kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+
     return kernel
 
 
-def reorder_loops_in_tensor_contraction(kernel, iname_order, accum_variable=True):
+def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variable=True):
+    """Change loop order in tensor contractions of sum factorization kernel
+
+    Since there is a reduction involved this implies more than just reordering
+    loops. There are two possibilities to handle the reduction:
+
+    1) Generate an object of the correct size for the accumulation and assign
+    it to the result data structure afterwards
+
+    2) Make sure to set the correct entries of the result data structure to
+    zero and accumulate in place
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    iname_permutation: tuple of ints
+        How to permute the loop inames. Should be a permutation of the
+        numbers 0,...,dim (one entry per loop, dim+1 entries in total)
+    accum_variable: bool
+        If True use method 1 else use method 2 from above
+
+    Notes
+    -----
+    Using einsum notation from numpy a contraction of a sum factorization
+    kernel can be written as:
+
+    - 3D: 'ij,jkl->kli'
+    - 2D: 'ij,jk->ki'
+
+    In the sum factorization kernel itself those inames are called:
+
+    sf_out_inames_2_* : l
+    sf_out_inames_1_* : k
+    sf_out_inames_0_* : i
+    sf_red_* : j
+
+    where * represents the current direction (0,1,2 for 3D problems). The
+    default order of the loops is
+
+    (sf_out_inames_2_*, sf_out_inames_1_*, sf_out_inames_0_*, sf_red_*)
+
+    A permutation vector of (3,2,0,1) would mean that we do
+
+    (sf_out_inames_0_*, sf_red_*, sf_out_inames_1_*, sf_out_inames_2_*)
+
+    instead.
+
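+    As a rough illustration of method 1 (pseudocode, not the actual generated
+    loopy code): in 2D the permutation (0,2,1) asks for the loop order
+    (sf_out_inames_1, sf_red, sf_out_inames_0), so the default nest
+
+        for k:
+            for i:
+                out[k,i] = sum(j, A[i,j] * x[j,k])
+
+    is transformed into
+
+        for k:
+            for i_dup:
+                accum[i_dup] = 0
+            for j:
+                for i:
+                    accum[i] += A[i,j] * x[j,k]
+            for i_dup:
+                out[k,i_dup] = accum[i_dup]
+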
+    """
     if accum_variable:
-        return _reorder_loops_in_tensor_contraction_accum(kernel, iname_order)
+        return _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
     else:
-        return _reorder_loops_in_tensor_contraction_direct(kernel, iname_order)
+        # Need to adapt this to the iname permutations first!
+        assert False
+        return _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
 
 
 def tensor_contraction_loop_order_generator(kernel):
@@ -336,7 +530,13 @@ def autotune_tensor_contraction_loop_order(kernel, signature):
 
 def sumfact_performance_transformations(kernel, signature):
     if kernel.name.startswith('sfimpl'):
-        # kernel = reorder_loops_in_tensor_contraction_with_accumulation_variable(kernel, "ljik")
+        # 2D
+        # kernel = reorder_loops_in_tensor_contraction(kernel, (2,0,1), True)
+
+        # 3D
+        # kernel = reorder_loops_in_tensor_contraction(kernel, (3,2,0,1), True)
+        # kernel = reorder_loops_in_tensor_contraction(kernel, (1,2,0,3), True)
+
         # kernel = autotune_tensor_contraction_loop_order(kernel, signature)
 
         pass