diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py
index ce16cd1121458ab27325312a727fd74c98d708cc..6bf800d59cc6ddeb683873793a4e72f9d0b11f0b 100644
--- a/python/dune/codegen/pdelab/argument.py
+++ b/python/dune/codegen/pdelab/argument.py
@@ -143,7 +143,10 @@ def pymbolic_coefficient(container, lfs, index):
     if not isinstance(lfs, Expression):
         lfs = Variable(lfs)
 
-    return Call(CoefficientAccess(container), (lfs, Variable(index),))
+    if isinstance(index, str):
+        index = Variable(index)
+
+    return Call(CoefficientAccess(container), (lfs, index,))
 
 
 def type_coefficientcontainer():
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index e4512dc199f5eacb533e57cafbe9b8eac2bf91d1..34599f55ad26f5859bfc5de4ce4996d6fd5e4098 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -15,6 +15,7 @@ from dune.codegen.generation import (backend,
                                      instruction,
                                      post_include,
                                      kernel_cached,
+                                     silenced_warning,
                                      temporary_variable,
                                      transform,
                                      valuearg
@@ -30,7 +31,8 @@ from dune.codegen.pdelab.restriction import restricted_name
 from dune.codegen.pdelab.signatures import assembler_routine_name
 from dune.codegen.pdelab.geometry import world_dimension
 from dune.codegen.pdelab.spaces import name_lfs
-from dune.codegen.sumfact.permutation import (permute_forward,
+from dune.codegen.sumfact.permutation import (permute_backward,
+                                              permute_forward,
                                               sumfact_cost_permutation_strategy,
                                               sumfact_quadrature_permutation_strategy,
                                               )
@@ -239,6 +241,25 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
                                       tags=frozenset({"sumfact_stage3"}),
                                       **args)})
 
+    def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l):
+        # TODO: This should happen in stage 2 and not in stage 3
+        shape = permute_backward(shape, self.cost_permutation)
+        inames = permute_backward(inames, self.cost_permutation)
+
+        # Get a temporary that interprets the base storage of the input
+        # as a column-major matrix. In later iteration of the matrix loop
+        # this reinterprets the output of the previous iteration.
+        inp = buffer.get_temporary("buff_step{}_in".format(l),
+                                   shape=shape + vec_shape,
+                                   dim_tags=ftags,
+                                   )
+
+        # The input temporary will only be read from, so we need to silence
+        # the loopy warning
+        silenced_warning('read_no_write({})'.format(inp))
+
+        return prim.Subscript(prim.Variable(inp), inames + vec_iname)
+
     @property
     def function_name_suffix(self):
         if get_form_option("fastdg"):
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index ccc3440698263643f67da433afa0ab53bb436b37..1db2b4bfc117161b261339942b4e2ecd065553ed 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -14,8 +14,10 @@ from dune.codegen.generation import (backend,
                                      iname,
                                      instruction,
                                      kernel_cached,
+                                     silenced_warning,
                                      temporary_variable,
                                      )
+from dune.codegen.loopy.flatten import flatten_index
 from dune.codegen.loopy.target import type_floatingpoint
 from dune.codegen.sumfact.tabulation import (basis_functions_per_direction,
                                              construct_basis_matrix_sequence,
@@ -107,24 +109,50 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
     def direct_is_possible(self):
         return get_form_option("fastdg")
 
-    def realize(self, sf, insn_dep, index=0):
+    def setup_input(self, sf, insn_dep, index=0):
+        """Setup input for a sum factorization kernel function
+
+        Write the coefficients into an array that can be passed to a sum
+        factorization kernel function (necessary if direct input "fastdg" is
+        not possible).
+
+        index: Vectorization index
+        """
+        # Inames for interating over the coefficients. We take them from the
+        # cost permuted matrix sequence. In order to get the inames in order
+        # x,y,... we need to take the permutation back.
+        shape_cost_permuted = tuple(mat.basis_size for mat in sf.matrix_sequence_cost_permuted)
+        shape_ordered = permute_backward(shape_cost_permuted, self.cost_permutation)
+        shape_ordered = permute_backward(shape_ordered, self.quadrature_permutation)
+        inames_cost_permuted = tuple(sumfact_iname(length, "setup_inames_" + str(k)) for k, length in enumerate(shape_cost_permuted))
+        inames_ordered = permute_backward(inames_cost_permuted, self.cost_permutation)
+        inames_ordered = permute_backward(inames_ordered, self.quadrature_permutation)
+
+        # The coefficient needs to be accessed with a flat index of inames ordered x,y,...
+        flat_index = flatten_index(tuple(prim.Variable(i) for i in inames_ordered),
+                                   shape_ordered,
+                                   order="f")
+
+        # Get the coefficient container
         lfs = name_lfs(self.element, self.restriction, self.element_index)
-        basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
         container = self.coeff_func(self.restriction)
         from dune.codegen.pdelab.argument import pymbolic_coefficient as pc
-        coeff = pc(container, lfs, basisiname)
+        coeff = pc(container, lfs, flat_index)
 
-        # Get the input temporary!
+        # The array that will be passed to the sum factorization kernel
+        # function should contain the coefficients in the cost permuted order!
         from dune.codegen.sumfact.realization import name_buffer_storage
         name = "input_{}".format(sf.buffer)
+        ftags = ",".join(["f"] * (sf.length + 1))
         temporary_variable(name,
-                           shape=(product(mat.basis_size for mat in sf.matrix_sequence_quadrature_permuted), sf.vector_width),
+                           shape=(sf.vector_width,) + shape_cost_permuted,
                            custom_base_storage=name_buffer_storage(sf.buffer, 0),
                            managed=True,
+                           dim_tags=ftags,
                            )
-
         assignee = prim.Subscript(prim.Variable(name),
-                                  (prim.Variable(basisiname),) + (index,))
+                                  (index,) + tuple(prim.Variable(i) for i in inames_cost_permuted))
+
         insn = instruction(assignee=assignee,
                            expression=coeff,
                            depends_on=sf.insn_dep.union(insn_dep),
@@ -134,6 +162,16 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
         return insn_dep.union(frozenset({insn}))
 
     def realize_direct(self, shape, inames, which=0):
+        # If the input comes directly from a global data structure inames are
+        # ordered x,y,z,...
+        #
+        # The inames and shape passed to this method come from the cost
+        # permuted matrix sequence so we need to permute them back
+        shape = permute_backward(shape, self.cost_permutation)
+        shape = permute_backward(shape, self.quadrature_permutation)
+        inames = permute_backward(inames, self.cost_permutation)
+        inames = permute_backward(inames, self.quadrature_permutation)
+
         arg = "fastdg{}".format(which)
 
         from dune.codegen.sumfact.accumulation import _dof_offset
@@ -145,16 +183,23 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
 
         return prim.Subscript(prim.Variable(arg), inames)
 
-    def realize_input(self, shape, inames, which=0):
-        if self.direct_is_possible:
-            shape = permute_backward(shape, self.cost_permutation)
-            shape = permute_backward(shape, self.quadrature_permutation)
-            inames = permute_backward(inames, self.cost_permutation)
-            inames = permute_backward(inames, self.quadrature_permutation)
+    def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l):
+        # Note: Here we do not need to reverse any permutation since this is
+        # already done in the setup_input method above!
 
-            return self.realize_direct(shape, inames)
-        else:
-            raise NotImplementedError("TODO")
+        # Get a temporary that interprets the base storage of the input
+        # as a column-major matrix. In later iteration of the matrix loop
+        # this reinterprets the output of the previous iteration.
+        inp = buffer.get_temporary("buff_step{}_in".format(l),
+                                   shape=shape + vec_shape,
+                                   dim_tags=ftags,
+                                   )
+
+        # The input temporary will only be read from, so we need to silence
+        # the loopy warning
+        silenced_warning('read_no_write({})'.format(inp))
+
+        return prim.Subscript(prim.Variable(inp), inames + vec_iname)
 
     @property
     def function_name_suffix(self):
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index ce2922a89739ac7077683f6caba9014dc0357e28..8652822725a24c53816ca7fe5cc20752ceb37bf3 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -15,6 +15,7 @@ from dune.codegen.generation import (backend,
                                      globalarg,
                                      valuearg,
                                      )
+from dune.codegen.loopy.flatten import flatten_index
 from dune.codegen.options import get_option
 from dune.codegen.pdelab.geometry import (local_dimension,
                                           world_dimension,
@@ -26,9 +27,10 @@ from dune.codegen.pdelab.localoperator import (name_ansatz_gfs_constructor_param
                                                lop_template_range_field,
                                                )
 from dune.codegen.pdelab.restriction import restricted_name
-from dune.codegen.sumfact.accumulation import basis_sf_kernels
+from dune.codegen.sumfact.accumulation import basis_sf_kernels, sumfact_iname
 from dune.codegen.sumfact.basis import construct_basis_matrix_sequence
-from dune.codegen.sumfact.permutation import (permute_forward,
+from dune.codegen.sumfact.permutation import (permute_backward,
+                                              permute_forward,
                                               sumfact_cost_permutation_strategy,
                                               sumfact_quadrature_permutation_strategy,
                                               )
@@ -113,15 +115,36 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord):
     def direct_is_possible(self):
         return False
 
-    def realize(self, sf, insn_dep, index=0):
-        # Note: world_dimension, since we only do evaluation of cell geometry mappings
+    def setup_input(self, sf, insn_dep, index=0):
+        # Inames for interating over the coefficients (in this case the
+        # coordinate of the component 'sefl.direction' of the corners). We take
+        # them from the cost permuted matrix sequence. In order to get the
+        # inames in order x,y,... we need to take the permutation back.
+        shape_cost_permuted = tuple(mat.basis_size for mat in sf.matrix_sequence_cost_permuted)
+        shape_ordered = permute_backward(shape_cost_permuted, self.cost_permutation)
+        shape_ordered = permute_backward(shape_ordered, self.quadrature_permutation)
+        inames_cost_permuted = tuple(sumfact_iname(length, "corner_setup_inames_" + str(k)) for k, length in enumerate(shape_cost_permuted))
+        inames_ordered = permute_backward(inames_cost_permuted, self.cost_permutation)
+        inames_ordered = permute_backward(inames_ordered, self.quadrature_permutation)
+
+        # Flat indices needed to access pdelab corner method
+        flat_index_ordered = flatten_index(tuple(prim.Variable(i) for i in inames_ordered),
+                                           shape_ordered,
+                                           order="f")
+        flat_index_cost_permuted = flatten_index(tuple(prim.Variable(i) for i in inames_cost_permuted),
+                                                 shape_cost_permuted,
+                                                 order="f")
+
+        # The array that will be passed to the sum factorization kernel
+        # function should contain the coefficients in the cost permuted order!
         name = "input_{}".format(sf.buffer)
+        ftags = ",".join(["f"] * (sf.length + 1))
         temporary_variable(name,
-                           shape=(2 ** world_dimension(), sf.vector_width),
+                           shape=(sf.vector_width,) + shape_cost_permuted,
                            custom_base_storage=name_buffer_storage(sf.buffer, 0),
                            managed=True,
+                           dim_tags=ftags,
                            )
-        ciname = global_corner_iname(self.restriction)
 
         if self.restriction == 0:
             geo = name_geometry()
@@ -132,23 +155,38 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord):
         #     method does return a non-scalar, which does not fit into the current
         #     loopy philosophy for function calls. This problem will be solved once
         #     #11 is resolved. Admittedly, the code looks *really* ugly until that happens.
-        code = "{}[{}*{}+{}] = {}.corner({})[{}];".format(name,
-                                                          sf.vector_width,
-                                                          ciname,
-                                                          index,
-                                                          geo,
-                                                          ciname,
-                                                          self.direction,
-                                                          )
+        code = "{}[{}*({})+{}] = {}.corner({})[{}];".format(name,
+                                                            sf.vector_width,
+                                                            str(flat_index_cost_permuted),
+                                                            index,
+                                                            geo,
+                                                            str(flat_index_ordered),
+                                                            self.direction,
+                                                            )
 
         insn = instruction(code=code,
-                           within_inames=frozenset({ciname}),
+                           within_inames=frozenset(inames_cost_permuted),
                            assignees=(name,),
                            tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
                            )
 
         return insn_dep.union(frozenset({insn}))
 
+    def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l):
+        # Get a temporary that interprets the base storage of the input
+        # as a column-major matrix. In later iteration of the matrix loop
+        # this reinterprets the output of the previous iteration.
+        inp = buffer.get_temporary("buff_step{}_in".format(l),
+                                   shape=shape + vec_shape,
+                                   dim_tags=ftags,
+                                   )
+
+        # The input temporary will only be read from, so we need to silence
+        # the loopy warning
+        silenced_warning('read_no_write({})'.format(inp))
+
+        return prim.Subscript(prim.Variable(inp), inames + vec_iname)
+
 
 @backend(interface="spatial_coordinate", name="default")
 def pymbolic_spatial_coordinate_multilinear(do_predicates, visitor):
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index 7207dd3829a4fc4e192278b2b43b5c9cbeaae8f0..daf872452a4a3c7db992e2831a3ae8f00451f927 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -75,8 +75,8 @@ def _realize_sum_factorization_kernel(sf):
     for buf in buffers:
         # Determine the necessary size of the buffer. We assume that we do not
         # underintegrate the form!!!
-        size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
-                   product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
+        size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
+                   product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
         temporary_variable("{}_dummy".format(buf),
                            shape=(size,),
                            custom_base_storage=buf,
@@ -85,7 +85,7 @@ def _realize_sum_factorization_kernel(sf):
 
     # Realize the input if it is not direct
     if sf.stage == 1 and not sf.interface.direct_is_possible:
-        insn_dep = insn_dep.union(sf.interface.realize(sf, insn_dep))
+        insn_dep = insn_dep.union(sf.interface.setup_input(sf, insn_dep))
 
     # Trigger generation of the sum factorization kernel function
     qp = quadrature_points_per_direction()
@@ -190,24 +190,20 @@ def realize_sumfact_kernel_function(sf):
         # * a value from a global data structure, broadcasted to a vector type
         #   (vectorized + FastDGGridOperator)
         input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
+
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
-            input_summand = sf.interface.realize_input(inp_shape, input_inames)
+            input_summand = sf.interface.realize_direct(inp_shape, input_inames)
+        elif l == 0:
+            # TODO: Simplify arguments!
+            input_summand = sf.interface.realize_input(inp_shape,
+                                                       input_inames,
+                                                       vec_shape,
+                                                       vec_iname,
+                                                       buffer,
+                                                       ftags,
+                                                       l,
+                                                       )
         else:
-            # If we did permute the order of a matrices above we also
-            # permuted the order of out_inames. Unfortunately the
-            # order of our input is from 0 to d-1. This means we need
-            # to permute _back_ to get the right coefficients.
-            if l == 0:
-                inp_shape = permute_backward(inp_shape, sf.cost_permutation)
-                input_inames = permute_backward(input_inames, sf.cost_permutation)
-                if sf.stage == 1:
-                    # In the unstructured case the sf.matrix_sequence_quadrature_permuted could
-                    # already be permuted according to
-                    # sf.interface.quadrature_permutation. We also need to reverse this
-                    # permutation to get the input from 0 to d-1.
-                    inp_shape = permute_backward(inp_shape, sf.interface.quadrature_permutation)
-                    input_inames = permute_backward(input_inames, sf.interface.quadrature_permutation)
-
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
@@ -237,6 +233,7 @@ def realize_sumfact_kernel_function(sf):
         # end and reverse permutation
         output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
         if l == len(matrix_sequence) - 1:
+            # TODO: Move permutations to interface!
             output_inames = permute_backward(output_inames, sf.cost_permutation)
             if sf.stage == 3:
                 output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation)
@@ -262,6 +259,7 @@ def realize_sumfact_kernel_function(sf):
             # If we are in the last step we reverse the permutation.
             output_shape = tuple(out_shape[1:]) + (out_shape[0],)
             if l == len(matrix_sequence) - 1:
+                # TODO: Move permutations to interface
                 output_shape = permute_backward(output_shape, sf.cost_permutation)
                 if sf.stage == 3:
                     output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation)
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 0b303c640952a98d40278e3b76fe6958e44a5d91..e51606dccad2fed8f4c5e736717b14ed6e057b0b 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -2,6 +2,7 @@
 
 from dune.codegen.options import get_form_option, get_option
 from dune.codegen.generation import (get_counted_variable,
+                                     silenced_warning,
                                      subst_rule,
                                      transform,
                                      )
@@ -99,9 +100,10 @@ class SumfactKernelInterfaceBase(object):
 
 
 class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
-    def __init__(self, interfaces):
+    def __init__(self, interfaces, perm):
         assert(isinstance(interfaces, tuple))
         self.interfaces = interfaces
+        self.vector_cost_permutation = perm
 
     def __repr__(self):
         return "_".join(repr(i) for i in self.interfaces)
@@ -124,7 +126,7 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
         for i in self.interfaces:
             assert i.cost_permutation == cost_permutation
 
-        return cost_permutation
+        return vector_cost_permutation
 
     @property
     def stage(self):
@@ -134,12 +136,14 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
     def direct_is_possible(self):
         return all(i.direct_is_possible for i in self.interfaces)
 
-    def realize(self, sf, dep):
+    def setup_input(self, sf, dep):
         for i, inp in enumerate(self.interfaces):
-            dep = dep.union(inp.realize(sf, dep, index=i))
+            dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
     def realize_direct(self, shape, inames):
+        # TODO: vector_cost_permutation not used!
+
         # Check whether the input exhibits a favorable structure
         # (whether we can broadcast scalar values into SIMD registers)
         total = set(self.interfaces)
@@ -166,16 +170,22 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             # need to load scalars into the SIMD vector.
             raise NotImplementedError("SIMD loads from scalars not implemented!")
 
-    def realize_input(self, shape, inames):
-        if self.direct_is_possible:
-            shape = permute_backward(shape, self.cost_permutation)
-            shape = permute_backward(shape, self.quadrature_permutation)
-            inames = permute_backward(inames, self.cost_permutation)
-            inames = permute_backward(inames, self.quadrature_permutation)
+    def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l):
+        # TODO: vector_cost_permutation not used!
 
-            return self.realize_direct(shape, inames)
-        else:
-            raise NotImplementedError("TODO")
+        # Get a temporary that interprets the base storage of the input
+        # as a column-major matrix. In later iteration of the matrix loop
+        # this reinterprets the output of the previous iteration.
+        inp = buffer.get_temporary("buff_step{}_in".format(l),
+                                   shape=shape + vec_shape,
+                                   dim_tags=ftags,
+                                   )
+
+        # The input temporary will only be read from, so we need to silence
+        # the loopy warning
+        silenced_warning('read_no_write({})'.format(inp))
+
+        return prim.Subscript(prim.Variable(inp), inames + vec_iname)
 
     @property
     def function_args(self):
@@ -198,8 +208,9 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
 
 
 class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
-    def __init__(self, interfaces):
+    def __init__(self, interfaces, perm):
         self.interfaces = interfaces
+        self.vector_cost_permutation = perm
 
     def __repr__(self):
         return "_".join(repr(o) for o in self.interfaces)
@@ -231,6 +242,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
         return prim.Call(prim.Variable(hadd_function), (result,))
 
     def realize(self, sf, result, insn_dep):
+        # TODO: vector_cost_permutation not used!
+
         outputs = set(self.interfaces)
 
         trial_element, = set(o.trial_element for o in self.interfaces)
@@ -250,6 +263,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
         return deps
 
     def realize_direct(self, result, inames, shape, **args):
+        # TODO: vector_cost_permutation not used!
+
         outputs = set(self.interfaces)
 
         # If multiple horizontal_add's are to be performed with 'result'
@@ -268,6 +283,25 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return deps
 
+    def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l):
+        # TODO: Include permutations of scalar kernels as soon as they could be different
+        shape = permute_backward(shape, self.vector_cost_permutation)
+        inames = permute_backward(inames, self.vector_cost_permutation)
+
+        # Get a temporary that interprets the base storage of the input
+        # as a column-major matrix. In later iteration of the matrix loop
+        # this reinterprets the output of the previous iteration.
+        inp = buffer.get_temporary("buff_step{}_in".format(l),
+                                   shape=shape + vec_shape,
+                                   dim_tags=ftags,
+                                   )
+
+        # The input temporary will only be read from, so we need to silence
+        # the loopy warning
+        silenced_warning('read_no_write({})'.format(inp))
+
+        return prim.Subscript(prim.Variable(inp), inames + vec_iname)
+
     @property
     def function_args(self):
         if get_form_option("fastdg"):
@@ -759,6 +793,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
 
     @property
     def matrix_sequence_quadrature_permuted(self):
+        # TODO: This should be turned into a RuntimeError
         return tuple(BasisTabulationMatrixArray(tuple(k.matrix_sequence_quadrature_permuted[i] for k in self.kernels),
                                                 width=self.vector_width,
                                                 )
@@ -819,10 +854,11 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
 
     @property
     def interface(self):
+        perm = self.cost_permutation
         if self.stage == 1:
-            return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels))
+            return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels), perm)
         else:
-            return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels))
+            return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels), perm)
 
     @property
     def cache_key(self):