diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index e360c84775a05fdbd30591dc9f8d7c4b9b26d068..516da6f08b6c2825f616c925448175b61d454a43 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -135,7 +135,7 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
         return ImmutableRecord.__repr__(self)
 
     def get_keyword_arguments(self):
-        """Get dictionary of keyword arguments needed to initialize this classIFIERS
+        """Get dictionary of keyword arguments needed to initialize this class
 
         Extract keyword arguments from the ImmutableRecord and modify
         accordingly. You need to set the correct matrix sequence before using
@@ -172,7 +172,10 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
             return lfs_inames(get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
     def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
-        # TODO: This should happen in stage 2 and not in stage 3
+        # The result of stage 2 has the correct quadrature permutation but no
+        # cost permutation is applied. The inames for this method are
+        # quadrature and cost permuted. This means we need to reverse the cost
+        # permutation to access the result of stage 2 in the correct way.
         shape = permute_backward(shape, self.cost_permutation)
         inames = permute_backward(inames, self.cost_permutation)
 
@@ -258,9 +261,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
 
         return frozenset({dep})
 
-    def realize_direct_output(self, result, inames, shape, which=0, reverse_cost_permutation=True, **args):
-        if reverse_cost_permutation:
-            inames = permute_backward(inames, self.cost_permutation)
+    def realize_direct_output(self, result, inames, shape, which=0, **args):
+        inames = permute_backward(inames, self.cost_permutation)
         inames = permute_backward(inames, self.quadrature_permutation)
 
         direct_output = "fastdg{}".format(which)
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index 8ab14a7d2f060bf0898bc76d58c1d3efc1e953c1..049e0be9f5adfae7d48ef9adc40a951adcc55924 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -95,7 +95,7 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
         return repr(self)
 
     def get_keyword_arguments(self):
-        """Get dictionary of keyword arguments needed to initialize this classIFIERS
+        """Get dictionary of keyword arguments needed to initialize this class
 
         Extract keyword arguments from the ImmutableRecord and modify
         accordingly. You need to set the correct matrix sequence before using
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index 7cd4bf0245f297ee52c6ee52bc02575f4b5f5839..b95b348c43d5820aea9e7e6b2d32491f1fc03023 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -104,7 +104,7 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord):
         return repr(self)
 
     def get_keyword_arguments(self):
-        """Get dictionary of keyword arguments needed to initialize this classIFIERS
+        """Get dictionary of keyword arguments needed to initialize this class
 
         Extract keyword arguments from the ImmutableRecord and modify
         accordingly. You need to set the correct matrix sequence before using
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index a9bd0b8baddfbc7c269d92e360bb8ad3bd194910..f14599e5e7009997da267bfd16402d1371e6a6ec 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -194,7 +194,6 @@ def realize_sumfact_kernel_function(sf):
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             input_summand = sf.interface.realize_direct_input(input_inames, inp_shape)
         elif l == 0:
-            # TODO: Simplify arguments!
             input_summand = sf.interface.realize_input(input_inames,
                                                        inp_shape,
                                                        vec_iname,
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 2b91aafa7ea2597e5b547d99671be3ac85949a81..b8b5b857728ea063edec3812b2132d210ffe93b3 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -31,8 +31,19 @@ import inspect
 
 
 class SumfactKernelInterfaceBase(object):
-    """ A base class for the input/output of a sum factorization kernel
+    """A base class for the input/output of a sum factorization kernel
     In stage 1, this represents the input object, in stage 3 the output object.
+
+    Notes about permutations:
+    - setup_input: handle cost and quadrature permutation
+    - realize_input stage 1: no permutations
+    - realize_input stage 3: only cost permutation
+    - realize_direct_input: cost and quadrature permutation
+    - accumulate_output: no permutation
+    - realize_direct_output: cost and quadrature permutation
+
+    In the vectorized case most permutation handling is forwarded to the scalar
+    kernels.
     """
     def setup_input(self, sf, insn_dep, index=0):
         """Create and fill an input array for a stage 1 sumfact kernel function (non fastdg)
@@ -57,7 +68,9 @@ class SumfactKernelInterfaceBase(object):
 
         Stage 1 : Input is already permuted the right way in setup_input.
 
-        Stage 3 : TODO -> Check permutation in accumulation.py
+        Stage 3 : The inames are cost and quadrature permuted but the input is
+          only quadrature permuted. This means we need to reverse the cost
+          permutation on the inames.
 
         Parameters
         ----------
@@ -118,12 +131,12 @@ class SumfactKernelInterfaceBase(object):
         """
         raise NotImplementedError
 
-    def realize_direct_output(self, result, iname, shape, which=0, reverse_cost_permutation=True, **kwargs):
+    def realize_direct_output(self, result, iname, shape, which=0, **kwargs):
         """Accumulate results directly in the sumfact kernel function (fastdg)
 
         This happens inside the sumfact kernel function.
 
-        TODO: Add note about permutation
+        Needs to handle cost and quadrature permutation.
 
         Parameters
         ----------
@@ -133,8 +146,6 @@ class SumfactKernelInterfaceBase(object):
         shape : tuple of ints
         which : int
             TODO Doc me!
-        reverse_cost_permutation : tuple of ints
-            TODO Doc me!
         **kwargs :
             Key word arguments passed to loopy instruction
         """
@@ -209,33 +220,30 @@ class SumfactKernelInterfaceBase(object):
 
 
 class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
-    def __init__(self, interfaces, perm):
+    def __init__(self, interfaces):
         assert(isinstance(interfaces, tuple))
         self.interfaces = interfaces
-        self.vector_cost_permutation = perm
 
     def __repr__(self):
         return "_".join(repr(i) for i in self.interfaces)
 
     @property
     def quadrature_permutation(self):
-        # TODO: For now we assure that all kerneles have the same quadrature_permutation
+        # TODO: For now we only vectorize sumfact kernels with the same
+        # quadrature permutation. This should be extended.
         for i in self.interfaces:
             assert i.quadrature_permutation == self.interfaces[0].quadrature_permutation
         return self.interfaces[0].quadrature_permutation
 
     @property
     def cost_permutation(self):
-        # The cost_permutation of the underlying scalar SumfactKernel can be
-        # different for each kernel.
-        # raise RuntimeError("cost_permutation should not be called on VectorSumfactKernelInput")
-
-        # TODO!
-        cost_permutation = self.interfaces[0].cost_permutation
+        # This should hold true due to the choice of quadrature
+        # permutation. For both structured and unstructured grids the order of
+        # the global directions should be the same leading to the same cost
+        # permutation for all those sum factorization kernels.
         for i in self.interfaces:
-            assert i.cost_permutation == cost_permutation
-
-        return self.vector_cost_permutation
+            assert i.cost_permutation == self.interfaces[0].cost_permutation
+        return self.interfaces[0].cost_permutation
 
     @property
     def stage(self):
@@ -251,8 +259,6 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
         return dep
 
     def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
-        # TODO: vector_cost_permutation not used!
-
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
@@ -268,8 +274,6 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
         return prim.Subscript(prim.Variable(inp), inames + vec_iname)
 
     def realize_direct_input(self, inames, shape):
-        # TODO: vector_cost_permutation not used!
-
         # Check whether the input exhibits a favorable structure
         # (whether we can broadcast scalar values into SIMD registers)
         total = set(self.interfaces)
@@ -317,25 +321,26 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
 
 
 class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
-    def __init__(self, interfaces, perm):
+    def __init__(self, interfaces):
         self.interfaces = interfaces
-        self._cost_permutation = perm
 
     def __repr__(self):
         return "_".join(repr(o) for o in self.interfaces)
 
     @property
     def cost_permutation(self):
-        return self._cost_permutation
+        # This should hold true due to the choice of quadrature
+        # permutation. For both structured and unstructured grids the order of
+        # the global directions should be the same leading to the same cost
+        # permutation for all those sum factorization kernels.
+        for i in self.interfaces:
+            assert i.cost_permutation == self.interfaces[0].cost_permutation
+        return self.interfaces[0].cost_permutation
 
     @property
     def quadrature_permutation(self):
-        # TODO: This should be turned into an error.
-        #
-        #
-        # The quadrature permutation could be different for different scalar kernels!
-        # raise RuntimeError('quadrature_permutation should not be called on VectorSumfactKernelOutput')
-        #
+        # TODO: For now we only vectorize sumfact kernels with the same
+        # quadrature permutation. This should be extended .
         for i in self.interfaces:
             assert i.quadrature_permutation == self.interfaces[0].quadrature_permutation
         return self.interfaces[0].quadrature_permutation
@@ -381,10 +386,6 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
         return prim.Subscript(prim.Variable(inp), inames + vec_iname)
 
     def realize_direct_output(self, result, inames, shape, **args):
-        # TODO: Find out what needs to happen here
-        inames = permute_backward(inames, self.cost_permutation)
-        shape = permute_backward(shape, self.cost_permutation)
-
         outputs = set(self.interfaces)
 
         # If multiple horizontal_add's are to be performed with 'result'
@@ -403,7 +404,6 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
                                                       inames,
                                                       shape,
                                                       which=which,
-                                                      reverse_cost_permutation=False,
                                                       **args))
 
         return deps
@@ -610,7 +610,6 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         # quadrature_permutation. This should be handled like upper/lower
         # vectorization
         return tuple(m.quadrature_size for m in self.matrix_sequence_quadrature_permuted) + tuple(m.basis_size for m in self.matrix_sequence_quadrature_permuted) + (self.stage, self.buffer, self.interface.within_inames) + (self.interface.direct_is_possible, self.interface.quadrature_permutation)
-        # return tuple(m.quadrature_size for m in self.matrix_sequence_quadrature_permuted) + tuple(m.basis_size for m in self.matrix_sequence_quadrature_permuted) + (self.stage, self.buffer, self.interface.within_inames) + (self.interface.direct_is_possible,)
 
     @property
     def cache_key(self):
@@ -951,7 +950,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
 
     @property
     def cost_permutation(self):
-        return self.kernels[0].cost_permutation
+        raise RuntimeError("cost_permutation should not be used on VectorizedSumfactKernel.")
 
     @property
     def stage(self):
@@ -1001,11 +1000,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
 
     @property
     def interface(self):
-        perm = self.cost_permutation
         if self.stage == 1:
-            return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels), perm)
+            return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels))
         else:
-            return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels), perm)
+            return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels))
 
     @property
     def cache_key(self):