diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index ab5ce9e1500baf2070cc0b8ad69d30ede8ec713e..ea22fd5d03950c65f2655a99ede0bce491ca5678 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -1,4 +1,5 @@
 import copy
+import itertools
 
 from dune.perftool.loopy.symbolic import substitute
 from dune.perftool.pdelab.argument import (name_accumulation_variable,
@@ -328,6 +329,85 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
             insn_dep = emit_sumfact_kernel(None, restriction, insn_dep)
 
 
+def _sf_permutation_heuristic(permutations, stage):
+    """Heuristic to choose a permutation
+
+    - Stage 1: Pick the permutation where in permutations[1:] most
+      elements are ordered by size
+    - Stage 3: Pick the permutation where in permutations[:-1] most
+      elements are ordered by size
+    """
+    def cost(perm, stage):
+        cost = 0
+        for i in range(0,len(perm)-2):
+            if stage==1:
+                if perm[i+1]>perm[i+2]:
+                    cost += 1
+            if stage==3:
+                if perm[0]>perm[i+1]:
+                    cost += 1
+        return cost
+
+    perm = min(permutations, key=lambda i:cost(i,stage))
+    return perm
+
+
+def _sf_flop_cost(a_matrices):
+    """Computational cost of sumfactorization with this list of a_matrices
+    """
+    cost = 0;
+    for l in range(len(a_matrices)):
+        cost_m = 1
+        cost_n = 1
+        for i in range(l+1):
+            cost_m *= a_matrices[i].rows
+        for i in range(l,len(a_matrices)):
+            cost_n *= a_matrices[i].cols
+        cost += cost_m * cost_n
+    return cost
+
+
+def _sf_permutation_strategy(a_matrices, stage):
+    """Choose permutation of a_matices list based on computational cost
+
+    Note: If there are multiple permutations with the same cost a
+    heuristic is used to pick one.
+    """
+    # Combine permutation and a_matrices list
+    perm = [i for i, _ in enumerate(a_matrices)]
+    perm_a_matrices = zip(perm, a_matrices)
+
+    # Find cost for all possible permutations of a_matrices list
+    perm_cost = []
+    for permutation in itertools.permutations(perm_a_matrices):
+        perm, series = zip(*permutation)
+        cost = _sf_flop_cost(series)
+        perm_cost.append((perm,cost))
+
+    # Find minimal cost and all permutations with that cost
+    _, costs = zip(*perm_cost)
+    minimal_cost = min(costs)
+    minimal_cost_permutations = [p[0] for p in perm_cost if p[1]==minimal_cost]
+
+    # Use heuristic to pic one of the minimal cost permutations
+    perm = _sf_permutation_heuristic(minimal_cost_permutations, stage)
+    return perm
+
+
+def _permute_forward(t, perm):
+    tmp = []
+    for pos in perm:
+        tmp.append(t[pos])
+    return tuple(tmp)
+
+
+def _permute_backward(t, perm):
+    tmp = [None]*len(t)
+    for i, pos in enumerate(perm):
+        tmp[pos] = t[i]
+    return tuple(tmp)
+
+
 @generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s, kw.get("restriction", 0)))
 def sum_factorization_kernel(a_matrices,
                              buf,
@@ -364,13 +444,22 @@ def sum_factorization_kernel(a_matrices,
     A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
     R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
 
-    So the multiplication with A_l is reduction over one index and the
-    transformation brings the next reduction index in the fastest
+    So the multiplication with A_l is a reduction over one index and
+    the transformation brings the next reduction index in the fastest
     position.
 
     Note: In the code below the transformation step is directly done
     in the reduction instruction by adapting the assignee!
 
+    It can make sense to permute the order of directions. If you have
+    a small m_l (e.g. stage 1 on faces) it is better to do direction l
+    first. This can be done permuting:
+
+    - The order of the A matrices.
+    - Permuting the input tensor.
+    - Permuting the output tensor (this assures that the directions of
+      the output tensor are again ordered from 0 to d-1).
+
     Arguments:
     ----------
     a_matrices: An iterable of AMatrix instances
@@ -391,7 +480,6 @@ def sum_factorization_kernel(a_matrices,
     restriction: Restriction for faces values.
     direct_input: Global data structure containing input for
         sumfactorization (e.g. when using FastDGGridOperator).
-
     """
     if get_global_context_value("dry_run", False):
         return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
@@ -418,6 +506,18 @@ def sum_factorization_kernel(a_matrices,
                                   within_inames=additional_inames,
                                   )})
 
+    # Decide in which order we want to process directions in the
+    # sumfactorization. A clever ordering can lead to a reduced
+    # complexity. This will e.g. happen at faces where we only have
+    # one quadratue point m_l=1 if l is the normal direction of the
+    # face.
+    #
+    # Rule of thumb: small m's early and large n's late.
+    perm = _sf_permutation_strategy(a_matrices, stage)
+
+    # Permute a_matrices
+    a_matrices = _permute_forward(a_matrices, perm)
+
     # Product of all matrices
     for l, a_matrix in enumerate(a_matrices):
         # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
@@ -446,16 +546,29 @@ def sum_factorization_kernel(a_matrices,
         # * an input temporary (default)
         # * a global data structure (if FastDGGridOperator is in use)
         # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
+        input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
         if l == 0 and direct_input is not None:
+            # See comment bellow
+            input_inames = _permute_backward(input_inames, perm)
+            inp_shape = _permute_backward(inp_shape, perm)
+
             globalarg(direct_input, dtype=np.float64, shape=inp_shape)
             if a_matrix.vectorized:
                 input_summand = prim.Call(prim.Variable("Vec4d"),
                                           (prim.Subscript(prim.Variable(direct_input),
-                                                          (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])),))
+                                                          input_inames),))
             else:
                 input_summand = prim.Subscript(prim.Variable(direct_input),
-                                               (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname)
+                                               input_inames + vec_iname)
         else:
+            # If we did permute the order of a matrices above we also
+            # permuted the order of out_inames. Unfortunately the
+            # order of our input is from 0 to d-1. This means we need
+            # to permute _back_ to get the right coefficients.
+            if l == 0:
+                inp_shape = _permute_backward(inp_shape, perm)
+                input_inames = _permute_backward(input_inames, perm)
+
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the amatrix loop
             # this reinterprets the output of the previous iteration.
@@ -467,7 +580,7 @@ def sum_factorization_kernel(a_matrices,
             silenced_warning('read_no_write({})'.format(inp))
 
             input_summand = prim.Subscript(prim.Variable(inp),
-                                           (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname)
+                                           input_inames + vec_iname)
 
         switch_base_storage(buf)
 
@@ -478,14 +591,16 @@ def sum_factorization_kernel(a_matrices,
         # corresponding shape (out_shape[0]) goes to the end (slowest
         # direction) and everything stays column major (ftags->fortran
         # style).
+        #
+        # If we are in the last step we reverse the permutation.
+        output_shape = tuple(out_shape[1:]) + (out_shape[0],)
+        if l == len(a_matrices)-1:
+            output_shape = _permute_backward(output_shape, perm)
         out = get_buffer_temporary(buf,
-                                   shape=tuple(out_shape[1:]) + (out_shape[0],) + vec_shape,
+                                   shape=output_shape + vec_shape,
                                    dim_tags=ftags)
 
         # Write the matrix-matrix multiplication expression
-        # matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
-        #                                   (prim.Variable(i), k_expr) + vec_iname),
-        #                    input_summand))
         matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
                                           (prim.Variable(out_inames[0]), k_expr) + vec_iname),
                            input_summand))
@@ -494,8 +609,12 @@ def sum_factorization_kernel(a_matrices,
         if a_matrix.cols != 1:
             matprod = lp.Reduction("sum", k, matprod)
 
-        # Here we also move the new direction (out_inames[0]) to the end
-        assignee = prim.Subscript(prim.Variable(out), tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + vec_iname)
+        # Here we also move the new direction (out_inames[0]) to the
+        # end and reverse permutation
+        output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
+        if l == len(a_matrices)-1:
+            output_inames = _permute_backward(output_inames, perm)
+        assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on