diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index b32541f69169a23d0c626d280a53f0ec91698122..3264a62c96844b372b7366b4260ef985f9f76904 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -171,7 +171,7 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
             from dune.codegen.sumfact.basis import lfs_inames
             return lfs_inames(get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buffer, ftags, l):
+    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
         # TODO: This should happen in stage 2 and not in stage 3
         shape = permute_backward(shape, self.cost_permutation)
         inames = permute_backward(inames, self.cost_permutation)
@@ -179,10 +179,10 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buffer.get_temporary("buff_step{}_in".format(l),
-                                   shape=shape + vec_shape,
-                                   dim_tags=ftags,
-                                   )
+        inp = buf.get_temporary("buff_step0_in",
+                                shape=shape + vec_shape,
+                                dim_tags=ftags,
+                                )
 
         # The input temporary will only be read from, so we need to silence
         # the loopy warning
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index 067f31e7384c43d9e31875cf705234fbd8b35a91..a26bae5bfc61717f06fe1bbd4d0abc99f28240cf 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -175,17 +175,17 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buffer, ftags, l):
+    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Note: Here we do not need to reverse any permutation since this is
         # already done in the setup_input method above!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buffer.get_temporary("buff_step{}_in".format(l),
-                                   shape=shape + vec_shape,
-                                   dim_tags=ftags,
-                                   )
+        inp = buf.get_temporary("buff_step0_in",
+                                shape=shape + vec_shape,
+                                dim_tags=ftags,
+                                )
 
         # The input temporary will only be read from, so we need to silence
         # the loopy warning
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index cd0d34b7a07d0e1cf341fb0a629dc3500f1684e2..7cd4bf0245f297ee52c6ee52bc02575f4b5f5839 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -189,14 +189,14 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buffer, ftags, l):
+    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buffer.get_temporary("buff_step{}_in".format(l),
-                                   shape=shape + vec_shape,
-                                   dim_tags=ftags,
-                                   )
+        inp = buf.get_temporary("buff_step0_in",
+                                shape=shape + vec_shape,
+                                dim_tags=ftags,
+                                )
 
         # The input temporary will only be read from, so we need to silence
         # the loopy warning
diff --git a/python/dune/codegen/sumfact/permutation.py b/python/dune/codegen/sumfact/permutation.py
index 1f775eba4e8ddf4cc5df8c5a8295e4794ea24d87..7f37dfeae795031ec81b292b96e65649f8fc78cc 100644
--- a/python/dune/codegen/sumfact/permutation.py
+++ b/python/dune/codegen/sumfact/permutation.py
@@ -145,6 +145,9 @@ def sumfact_quadrature_permutation_strategy(dim, restriction):
         if restriction == Restriction.POSITIVE:
             return _order_on_self(restriction)
         else:
+            # Still do normal direction first. The other two directions need to
+            # be done in reverse order to go through the quadrature points in
+            # the same order as on self (draw cubes!).
             assert restriction == Restriction.NEGATIVE
             l = list(_order_on_self(restriction))
             return (l[0], l[2], l[1])
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index d1f10d6f9c659fd8c719e19a59658245e5c27b39..010118b10a1c2f1d68109f16b8f5471d03da9224 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -201,7 +201,6 @@ def realize_sumfact_kernel_function(sf):
                                                        vec_shape,
                                                        buffer,
                                                        ftags,
-                                                       l,
                                                        )
         else:
             # Get a temporary that interprets the base storage of the input
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 25b3a79578364f9be94374e03399881e06dc9ae9..96db55bb2f83744d56617cd1af028aaf98f41afd 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -35,23 +35,43 @@ class SumfactKernelInterfaceBase(object):
     In stage 1, this represents the input object, in stage 3 the output object.
     """
     def setup_input(self, sf, insn_dep, index=0):
-        """Create and fill an input array for sumfact kernel function (non fastdg)
+        """Create and fill an input array for a stage 1 sumfact kernel function (non fastdg)
 
-        This happens before the function call.
+        This happens before the function call. The input will be quadrature
+        (for unstructured grids) and cost permuted.
+
+        Parameters
+        ----------
+        sf : SumfactKernel or VectorizedSumfactKernel
+        insn_dep : frozenset
+            Instructions this setup depends on.
+        index : int
+            Vectorization index, SIMD lane.
 
-        TODO: Add note about permutation
-        TODO: Document input arguments
         """
         raise NotImplementedError
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buffer, ftags, l):
+    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
         """Interpret the input of sumfact kernel function in the right way (non fastdgg)
 
         This happens inside the sumfact kernel function.
 
-        TODO: Cleanup arguments
-        TODO: Add note about permutation
-        TODO: Document input arguments
+        Stage 1: Input is already permuted the right way in setup_input.
+
+        Stage 3: TODO -> Check permutation in accumulation.py
+
+        Parameters
+        ----------
+        inames : tuple of pymbolic.primitives.Variable
+        shape : tuple of int
+        vec_iname : tuple of pymbolic.primitives.Variable
+            In case of vectorized kernel provide vectorization iname.
+        vec_shape : tuple of int
+            In case of vectorized kernel provide the number of vectorized kernels.
+        buf : dune.codegen.sumfact.realization.BufferSwitcher
+            Provides the input variable.
+        ftags : str
+            dim_tags needed to access input variable correctly.
         """
         raise NotImplementedError
 
@@ -66,7 +86,7 @@ class SumfactKernelInterfaceBase(object):
         raise NotImplementedError
 
     def accumulate_output(self, sf, result, insn_dep, inames=None, additional_inames=()):
-        """Generate accumulate instruction after sumfact kernel function (non fastdg)
+        """Generate accumulate instruction after a stage 3 sumfact kernel function (non fastdg)
 
         This happens after the function call.
 
@@ -80,8 +100,13 @@ class SumfactKernelInterfaceBase(object):
 
         This happens inside the sumfact kernel function.
 
+        Stage 1: Reverse cost permutation, output should only be quadrature
+        permuted.
+
+        Stage 3: Reverse cost and quadrature permutation. The output will be
+        sorted according to dof/residual vector.
+
         TODO: Cleanup arguments
-        TODO: Add note about permutation
         TODO: Document input arguments
         """
         inames = permute_backward(inames, self.cost_permutation)
@@ -223,16 +248,16 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buffer, ftags, l):
+    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
         # TODO: vector_cost_permutation not used!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buffer.get_temporary("buff_step{}_in".format(l),
-                                   shape=shape + vec_shape,
-                                   dim_tags=ftags,
-                                   )
+        inp = buf.get_temporary("buff_step0_in",
+                                shape=shape + vec_shape,
+                                dim_tags=ftags,
+                                )
 
         # The input temporary will only be read from, so we need to silence
         # the loopy warning
@@ -332,7 +357,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return prim.Call(prim.Variable(hadd_function), (result,))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buffer, ftags, l):
+    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The input for stage 3 is quadrature permuted. The inames and shape
         # passed to this method are quadrature and cost permuted. This means we
         # need to take the cost permutation back to get the right inames and
@@ -342,10 +367,10 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         # Get a temporary that interprets the base storage of the input as a
         # column-major matrix.
-        inp = buffer.get_temporary("buff_step{}_in".format(l),
-                                   shape=shape + vec_shape,
-                                   dim_tags=ftags,
-                                   )
+        inp = buf.get_temporary("buff_step0_in",
+                                shape=shape + vec_shape,
+                                dim_tags=ftags,
+                                )
 
         # The input temporary will only be read from, so we need to silence
         # the loopy warning