diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py
index a0a7de0172f6310067ddd7a72160a595889fdcaa..a592c4610f87de0d38bb7da242f3ec8f798b7974 100644
--- a/python/dune/perftool/loopy/symbolic.py
+++ b/python/dune/perftool/loopy/symbolic.py
@@ -3,7 +3,7 @@
 Use this module to insert pymbolic nodes and the likes.
 """
 from dune.perftool.error import PerftoolError
-from dune.perftool.sumfact.symbolic import SumfactKernel
+from dune.perftool.sumfact.symbolic import SumfactKernel, VectorizedSumfactKernel
 from pymbolic.mapper.substitutor import make_subst_func
 
 import loopy as lp
@@ -111,6 +111,15 @@ lp.symbolic.DependencyMapper.map_sumfact_kernel = dependency_map_sumfact_kernel
 lp.target.c.codegen.expression.ExpressionToCExpressionMapper.map_sumfact_kernel = needs_resolution
 lp.type_inference.TypeInferenceMapper.map_sumfact_kernel = needs_resolution
 
+# VectorizedSumfactKernel node
+lp.symbolic.IdentityMapper.map_vectorized_sumfact_kernel = identity_map_sumfact_kernel
+lp.symbolic.SubstitutionMapper.map_vectorized_sumfact_kernel = lp.symbolic.SubstitutionMapper.map_variable
+lp.symbolic.WalkMapper.map_vectorized_sumfact_kernel = walk_map_sumfact_kernel
+lp.symbolic.StringifyMapper.map_vectorized_sumfact_kernel = stringify_map_sumfact_kernel
+lp.symbolic.DependencyMapper.map_vectorized_sumfact_kernel = dependency_map_sumfact_kernel
+lp.target.c.codegen.expression.ExpressionToCExpressionMapper.map_vectorized_sumfact_kernel = needs_resolution
+lp.type_inference.TypeInferenceMapper.map_vectorized_sumfact_kernel = needs_resolution
+
 # FusedMultiplyAdd node
 lp.symbolic.IdentityMapper.map_fused_multiply_add = identity_map_fused_multiply_add
 lp.symbolic.WalkMapper.map_fused_multiply_add = walk_map_fused_multiply_add
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index 56ad22cba11a0a501c1c8a285240311d4ddc18c3..d09a91b7b27e5248867c268d4136509d357754fa 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -103,24 +103,24 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                            )
 
         from dune.perftool.sumfact.vectorization import attach_vectorization_info
-        sf = attach_vectorization_info(sf)
+        vsf = attach_vectorization_info(sf)
 
         # Make sure we have a buffer that we can set up the input with
-        buffer = sf.buffer
+        buffer = vsf.buffer
         if buffer is None:
             buffer = get_counted_variable("buffer")
 
-        vectag = frozenset({"gradvec"}) if sf.index is not None else frozenset()
+        vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset()
 
         temp = get_buffer_temporary(buffer,
-                                    shape=sf.quadrature_shape,
-                                    dim_tags=sf.quadrature_dimtags,
-                                    name=sf.input,
+                                    shape=vsf.quadrature_shape,
+                                    dim_tags=vsf.quadrature_dimtags,
+                                    name=vsf.input,
                                     )
 
         # Those input fields, that are padded need to be set to zero
         # in order to do a horizontal_add later on
-        for pad in sf.padding:
+        for pad in vsf.padding:
             assignee = prim.Subscript(prim.Variable(temp),
                                       tuple(prim.Variable(i) for i in quadrature_inames()) + (pad,))
             instruction(assignee=assignee,
@@ -156,7 +156,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         # Issue an instruction in the quadrature loop that fills the buffer
         # with the evaluation of the contribution at all quadrature points
         assignee = prim.Subscript(prim.Variable(temp),
-                                  tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index)
+                                  tuple(prim.Variable(i) for i in quadrature_inames()) + vsf.vec_index_tuple(sf))
         contrib_dep = instruction(assignee=assignee,
                                   expression=expression,
                                   forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
@@ -174,7 +174,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                               within_inames=frozenset(visitor.inames))})
 
         inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i)
-                       for i, mat in enumerate(sf.matrix_sequence))
+                       for i, mat in enumerate(vsf.matrix_sequence))
 
         # Collect the lfs and lfs indices for the accumulate call
         test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames),
@@ -196,13 +196,14 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         # Add a sum factorization kernel that implements the multiplication
         # with the test function (stage 3)
         from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
-        result, insn_dep = realize_sum_factorization_kernel(sf.copy(insn_dep=insn_dep))
+        result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=insn_dep))
 
         # Determine the expression to accumulate with. This depends on the vectorization strategy!
         result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
         vecinames = ()
-        if sf.index is not None:
-            iname = accum_iname((accterm.argument.restriction, restriction), 4, "vec")
+        # TODO: evaluate whether the following line would be okay with vsf.vectorized
+        if vsf.vec_index(sf) is not None:
+            iname = accum_iname((accterm.argument.restriction, restriction), vsf.vector_width, "vec")
             vecinames = (iname,)
             transform(lp.tag_inames, [(iname, "vec")])
             from dune.perftool.tools import maybe_wrap_subscript
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index b4861eebf81923c6253475cfdb7c55cd553d08d4..3630d9f2750fa29874f7ee7e6bf722e581dfa0fc 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -85,16 +85,16 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
                            )
 
         from dune.perftool.sumfact.vectorization import attach_vectorization_info
-        sf = attach_vectorization_info(sf)
+        vsf = attach_vectorization_info(sf)
 
-        if i != sf.index:
+        if i != vsf.vec_index(sf):
             direct_indexing_is_possible = False
 
         # Add a sum factorization kernel that implements the
         # evaluation of the gradients of basis functions at quadrature
         # points (stage 1)
         from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
-        var, insn_dep = realize_sum_factorization_kernel(sf)
+        var, insn_dep = realize_sum_factorization_kernel(vsf)
 
         buffers.append(var)
 
@@ -132,17 +132,16 @@ def pymbolic_coefficient(element, restriction, component, coeff_func, visitor):
                        component=component,
                        )
 
-    # TODO: Move this away!
     from dune.perftool.sumfact.vectorization import attach_vectorization_info
-    sf = attach_vectorization_info(sf)
+    vsf = attach_vectorization_info(sf)
 
     # Add a sum factorization kernel that implements the evaluation of
     # the basis functions at quadrature points (stage 1)
     from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
-    var, _ = realize_sum_factorization_kernel(sf)
+    var, _ = realize_sum_factorization_kernel(vsf)
 
     return prim.Subscript(var,
-                          tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index
+                          tuple(prim.Variable(i) for i in quadrature_inames()) + vsf.vec_index_tuple(sf)
                           ), visitor.indices
 
 
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index dcc06af6b0ee4121934fc4ecc312493342abc1da..583f8f4d18b7a9d5fcae8e614e8cc158016fa7d7 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -37,11 +37,13 @@ def realize_sum_factorization_kernel(sf, **kwargs):
     if get_global_context_value("dry_run", False):
         return sf, sf.insn_dep
     else:
-        _realize_input(sf)
         return _realize_sum_factorization_kernel(sf, **kwargs)
 
 
-def _realize_input(sf):
+@generator_factory(item_tags=("sumfactkernel",),
+                   context_tags=("kernel",),
+                   cache_key_generator=lambda s, **kw: s.cache_key)
+def _realize_sum_factorization_kernel(sf):
     # Set up the input for stage 1
     if sf.stage == 1 and not get_option("fastdg"):
         assert sf.coeff_func
@@ -53,22 +55,24 @@ def _realize_input(sf):
                                            name=sf.input
                                            )
 
-        # Write initial coefficients into buffer
-        lfs = name_lfs(sf.element, sf.restriction, sf.component)
-        basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
-        container = sf.coeff_func(sf.restriction)
-        coeff = pymbolic_coefficient(container, lfs, basisiname)
-        assignee = prim.Subscript(prim.Variable(input_setup), (prim.Variable(basisiname),) + sf.vec_index)
-        instruction(assignee=assignee,
-                    expression=coeff,
-                    depends_on=sf.insn_dep,
-                    )
-
+        def _write_input(inputsf):
+            # Write initial coefficients into buffer
+            lfs = name_lfs(inputsf.element, inputsf.restriction, inputsf.component)
+            basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
+            container = inputsf.coeff_func(inputsf.restriction)
+            coeff = pymbolic_coefficient(container, lfs, basisiname)
+            assignee = prim.Subscript(prim.Variable(input_setup), (prim.Variable(basisiname),) + sf.vec_index_tuple(inputsf))
+            instruction(assignee=assignee,
+                        expression=coeff,
+                        depends_on=inputsf.insn_dep,
+                        )
+
+        if sf.vectorized:
+            for inputsf in sf.kernels:
+                _write_input(inputsf)
+        else:
+            _write_input(sf)
 
-@generator_factory(item_tags=("sumfactkernel",),
-                   context_tags=("kernel",),
-                   cache_key_generator=lambda s, **kw: s.cache_key)
-def _realize_sum_factorization_kernel(sf):
     # Add a dependency on the input variable
     insn_dep = sf.insn_dep
     if sf.input:
@@ -91,7 +95,7 @@ def _realize_sum_factorization_kernel(sf):
     if sf.vectorized:
         ftags = ftags + ",vec"
         ctags = ctags + ",vec"
-        vec_shape = (4,)
+        vec_shape = (sf.vector_width,)
 
     # Measure times and count operations in c++ code
     if get_option("instrumentation_level") >= 4:
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index fb03a88d5f423fcd079a94598663ffc69de4937e..37fa5e3fc7b453192a8a3dbe974b1e6193659500 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -1,13 +1,22 @@
 """ A pymbolic node representing a sum factorization kernel """
 
-from pytools import ImmutableRecord
+from dune.perftool.options import get_option
+from dune.perftool.generation import get_counted_variable
+from dune.perftool.sumfact.tabulation import BasisTabulationMatrixArray
+
+from pytools import ImmutableRecord, product
 
 import pymbolic.primitives as prim
 import loopy as lp
+import frozendict
 import inspect
 
 
-class SumfactKernel(ImmutableRecord, prim.Variable):
+class SumfactKernelBase(object):
+    pass
+
+
+class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def __init__(self,
                  matrix_sequence=None,
                  buffer=None,
@@ -16,8 +25,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
                  restriction=0,
                  within_inames=(),
                  input=None,
-                 padding=frozenset(),
-                 index=None,
                  insn_dep=frozenset(),
                  coeff_func=None,
                  element=None,
@@ -92,7 +99,7 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
         component: The treepath to the correct component of above element
         accumvar: The accumulation variable to accumulate into
         """
-        # Check the input and apply defaults where necessary
+        # Assert the inputs!
         assert isinstance(matrix_sequence, tuple)
         from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase
         assert all(isinstance(m, BasisTabulationMatrixBase) for m in matrix_sequence)
@@ -163,31 +170,25 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
 
     @property
     def vectorized(self):
-        return next(iter(self.matrix_sequence)).vectorized
+        return False
 
     @property
     def transposed(self):
         return next(iter(self.matrix_sequence)).transpose
 
-    @property
-    def vec_index(self):
-        """ A tuple with the vector index
+    def vec_index(self, sf):
+        """ Map an unvectorized sumfact kernel object to its position
+        in the vectorized kernel
+        """
+        return None
 
-        Defaults to the empty tuple to allow addition to any
-        existing index tuple """
-        if self.index is not None:
-            return (self.index,)
-        else:
-            return ()
+    def vec_index_tuple(self, sf):
+        return ()
 
     @property
     def flat_input_shape(self):
         """ The 'flat' input tensor shape """
-        from pytools import product
-        shape = (product(mat.cols for mat in self.matrix_sequence),)
-        if self.vectorized:
-            shape = shape + (4,)
-        return shape
+        return (product(mat.cols for mat in self.matrix_sequence),)
 
     @property
     def quadrature_shape(self):
@@ -196,12 +197,9 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
         Takes into account the lower dimensionality of faces and vectorization.
         """
         if self.transposed:
-            shape = tuple(mat.cols for mat in self.matrix_sequence if mat.face is None)
+            return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None)
         else:
-            shape = tuple(mat.rows for mat in self.matrix_sequence if mat.face is None)
-        if self.vectorized:
-            shape = shape + (4,)
-        return shape
+            return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None)
 
     @property
     def quadrature_dimtags(self):
@@ -210,8 +208,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
         Takes into account the lower dimensionality of faces and vectorization.
         """
         tags = ["f"] * len(self.quadrature_shape)
-        if self.vectorized:
-            tags[-1] = 'c'
         return ",".join(tags)
 
     @property
@@ -221,8 +217,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
         Takes into account vectorization.
         """
         shape = tuple(mat.rows for mat in self.matrix_sequence)
-        if self.vectorized:
-            shape = shape + (4,)
         return shape
 
     @property
@@ -232,8 +226,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
         Takes into account vectorization.
         """
         tags = ["f"] * len(self.dof_shape)
-        if self.vectorized:
-            tags[-1] = 'vec'
         return ",".join(tags)
 
     @property
@@ -251,13 +243,221 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
             return self.dof_dimtags
 
     def output_to_pymbolic(self, name):
-        if self.vectorized:
-            return lp.TaggedVariable(name, "vector")
-        else:
-            return lp.TaggedVariable(name, "sumfac")
+        return lp.TaggedVariable(name, "sumfac")
+
+    #
+    # Define properties for conformity with the interface of VectorizedSumfactKernel
+    #
+
+    @property
+    def padding(self):
+        return set()
 
 
 # Extract the argument list and store it on the class. This needs to be done
 # outside of the class because the SumfactKernel class object needs to be fully
 # initialized in order to extract the information from __init__.
 SumfactKernel.init_arg_names = tuple(inspect.getargspec(SumfactKernel.__init__)[0][1:])
+
+
+class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
+    def __init__(self,
+                 kernels=None,
+                 vector_width=get_option('max_vector_width'),
+                 buffer=None,
+                 input=None,
+                 insn_dep=frozenset(),
+                 ):
+        # Assert the input data structure
+        assert isinstance(kernels, tuple)
+        assert all(isinstance(k, SumfactKernel) for k in kernels)
+
+        # Assert all the properties that need to be the same across all subkernels
+        assert len(set(k.stage for k in kernels)) == 1
+        assert len(set(k.length for k in kernels)) == 1
+        assert len(set(k.restriction for k in kernels)) == 1
+        assert len(set(k.within_inames for k in kernels)) == 1
+
+        # Assert properties of the matrix sequence of the underlying kernels
+        for i in range(kernels[0].length):
+            assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1
+            assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1
+            assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1
+            assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1
+
+        # Join the instruction dependencies of all subkernels
+        insn_dep = insn_dep.union(k.insn_dep for k in kernels)
+
+        # We currently assume that all subkernels are consecutive, 0-based within the vector
+        assert None not in kernels
+
+        ImmutableRecord.__init__(self,
+                                 kernels=kernels,
+                                 vector_width=vector_width,
+                                 buffer=buffer,
+                                 input=input,
+                                 insn_dep=insn_dep,
+                                 )
+
+        prim.Variable.__init__(self, "VecSUMFAC")
+
+    def __getinitargs__(self):
+        return (self.kernels, self.vector_width, self.buffer, self.input, self.insn_dep)
+
+    def stringifier(self):
+        return lp.symbolic.StringifyMapper
+
+    mapper_method = "map_vectorized_sumfact_kernel"
+
+    init_arg_names = ("kernels", "vector_width", "buffer", "input", "insn_dep")
+
+    #
+    # Some cache key definitions
+    # Watch out for the documentation to see which key is used unter what circumstances
+    #
+
+    @property
+    def cache_key(self):
+        """ The cache key that can be used in generation magic
+        Any two sum factorization kernels having the same cache_key
+        are realized simulatenously!
+        """
+        return (self.matrix_sequence, self.restriction, self.stage, self.buffer)
+
+    @property
+    def input_key(self):
+        """ A cache key for the input coefficients
+        Any two sum factorization kernels having the same input_key
+        work on the same input coefficient (and are suitable for simultaneous
+        treatment because of that)
+        """
+        return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar)
+
+    #
+    # Deduce all data fields of normal sum factorization kernels from the underlying kernels
+    #
+
+    @property
+    def matrix_sequence(self):
+        return tuple(BasisTabulationMatrixArray(rows=self.kernels[0].matrix_sequence[i].rows,
+                                                cols=self.kernels[0].matrix_sequence[i].cols,
+                                                transpose=self.kernels[0].matrix_sequence[i].transpose,
+                                                face=self.kernels[0].matrix_sequence[i].face,
+                                                derivative=tuple(k.matrix_sequence[i].derivative for k in self.kernels),
+                                                width=self.vector_width,
+                                                )
+                     for i in range(self.length))
+
+    @property
+    def stage(self):
+        return self.kernels[0].stage
+
+    @property
+    def restriction(self):
+        return self.kernels[0].restriction
+
+    @property
+    def within_inames(self):
+        return self.kernels[0].within_inames
+
+    @property
+    def coeff_func(self):
+        assert len(set(k.coeff_func for k in self.kernels)) == 1
+        return self.kernels[0].coeff_func
+
+    @property
+    def element(self):
+        assert len(set(k.element for k in self.kernels)) == 1
+        return self.kernels[0].element
+
+    @property
+    def component(self):
+        assert len(set(k.component for k in self.kernels)) == 1
+        return self.kernels[0].component
+
+    @property
+    def accumvar(self):
+        assert len(set(k.accumvar for k in self.kernels)) == 1
+        return self.kernels[0].accumvar
+
+    #
+    # Define some properties only needed for this one
+    #
+
+    @property
+    def padding(self):
+        return set(range(self.vector_width)) - set(range(len(self.kernels)))
+
+    #
+    # Define the same properties the normal SumfactKernel defines
+    #
+
+    @property
+    def cache_key(self):
+        return tuple(k.cache_key for k in self.kernels)
+
+    @property
+    def input_key(self):
+        return tuple(k.input_key for k in self.kernels)
+
+    @property
+    def length(self):
+        return self.kernels[0].length
+
+    @property
+    def vectorized(self):
+        return True
+
+    @property
+    def transposed(self):
+        return self.kernels[0].transposed
+
+    def vec_index(self, sf):
+        return self.kernels.index(sf)
+
+    def vec_index_tuple(self, sf):
+        return (self.vec_index(sf),)
+
+    @property
+    def flat_input_shape(self):
+        return (product(mat.cols for mat in self.matrix_sequence), self.vector_width)
+
+    @property
+    def quadrature_shape(self):
+        if self.transposed:
+            return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,)
+        else:
+            return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,)
+
+    @property
+    def quadrature_dimtags(self):
+        tags = ["f"] * len(self.quadrature_shape)
+        tags[-1] = 'c'
+        return ",".join(tags)
+
+    @property
+    def dof_shape(self):
+        return tuple(mat.rows for mat in self.matrix_sequence) + (self.vector_width,)
+
+    @property
+    def dof_dimtags(self):
+        tags = ["f"] * len(self.dof_shape)
+        tags[-1] = 'vec'
+        return ",".join(tags)
+
+    @property
+    def output_shape(self):
+        if self.stage == 1:
+            return self.quadrature_shape
+        else:
+            return self.dof_shape
+
+    @property
+    def output_dimtags(self):
+        if self.stage == 1:
+            return self.quadrature_dimtags
+        else:
+            return self.dof_dimtags
+
+    def output_to_pymbolic(self, name):
+        return lp.TaggedVariable(name, "vector")
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index c6fa99e3b28a1a9f1a499d6559461a5ec1af7cc2..e64659851c51eea8be01dd3e69f8f258c799f0ad 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -59,7 +59,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase):
 
 
 class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
-    def __init__(self, rows, cols, transpose, derivative, face):
+    def __init__(self, rows, cols, transpose, derivative, face, width):
         assert isinstance(derivative, tuple)
         BasisTabulationMatrixBase.__init__(self,
                                            rows=rows,
@@ -67,6 +67,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
                                            transpose=transpose,
                                            derivative=derivative,
                                            face=face,
+                                           width=width,
                                            )
 
     @property
@@ -77,13 +78,22 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
                                                quadrature_points_per_direction(),
                                                )
         for i, d in enumerate(self.derivative):
-            define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,))
+            define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,), width=self.width)
+
+        # Apply padding to those fields not used. This is necessary because you may get memory
+        # initialized with NaN and those NaNs will screw the horizontal_add.
+        i = theta_iname("i", self.rows)
+        j = theta_iname("j", self.cols)
+        for index in set(range(self.width)) - set(range(len(self.derivative))):
+            instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j), index)),
+                        expression=0,
+                        kernel="operator")
 
         return loopy_class_member(name,
                                   classtag="operator",
                                   dtype=numpy.float64,
                                   dim_tags="f,f,vec",
-                                  shape=(self.rows, self.cols, 4),
+                                  shape=(self.rows, self.cols, self.width),
                                   potentially_vectorized=True,
                                   managed=True,
                                   )
@@ -214,7 +224,7 @@ def polynomial_lookup_mangler(target, func, dtypes):
         return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))
 
 
-def define_theta(name, shape, transpose, derivative, face=None, additional_indices=()):
+def define_theta(name, shape, transpose, derivative, face=None, additional_indices=(), width=None):
     qp = name_oned_quadrature_points()
     qw = name_oned_quadrature_weights()
     sort_quadrature_points_weights(qp, qw)
@@ -223,7 +233,7 @@ def define_theta(name, shape, transpose, derivative, face=None, additional_indic
     dim_tags = "f,f"
     if additional_indices:
         dim_tags = dim_tags + ",c"
-        shape = shape + (4,)
+        shape = shape + (width,)
 
     loopy_class_member(name,
                        dtype=numpy.float64,
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index ea0584641d81440497d98f7101c8c82a03fbd46f..460ae1e9fb77ffac0b26e7fb733b4ffa4e10adb8 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -1,6 +1,7 @@
 """ Sum factorization vectorization """
 
-from dune.perftool.loopy.symbolic import SumfactKernel
+from dune.perftool.loopy.vcl import get_vcl_type_size
+from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
 from dune.perftool.generation import (generator_factory,
                                       get_counted_variable,
                                       get_global_context_value,
@@ -13,6 +14,7 @@ from dune.perftool.error import PerftoolError
 from dune.perftool.options import get_option
 
 import loopy as lp
+import numpy as np
 
 
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
@@ -40,10 +42,16 @@ def no_vectorization(sumfacts):
 
 
 def horizontal_vectorization_strategy(sumfacts):
-    if len(sumfacts) in (3, 4):
+    width = get_vcl_type_size(np.float64)
+    # We currently heuristically allow horizontal vectorization if the number
+    # of kernels matches the maximum vector width or is one below that. There
+    # is a lot of TODOs with that procedure:
+    # * You might have *more* kernels than vector width
+    # * You might want to combine horizontal and vertical strategies
+    if len(sumfacts) in (width - 1, width):
         # Map the sum factorization to their position in the joint kernel
         position_mapping = {}
-        available = set(range(4))
+        available = set(range(width))
         for sf in sumfacts:
             if sf.preferred_position is not None:
                 # This asserts that no two kernels want to take the same position
@@ -57,39 +65,22 @@ def horizontal_vectorization_strategy(sumfacts):
             if sumf.preferred_position is None:
                 position_mapping[sumf] = available.pop()
 
-        # Enable vectorization strategy:
-        inp = get_counted_variable("joined_input")
-        buf = get_counted_variable("joined_buffer")
-
-        # Collect the large matrices!
-        large_matrix_sequence = []
-        for i in range(len(next(iter(sumfacts)).matrix_sequence)):
-            # Assert that the matrices of all sum factorizations have the same size
-            assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1
-            assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1
-
-            # Collect the derivative information
-            derivative = [False] * 4
-            for sf in sumfacts:
-                derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative
-
-            large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows,
-                                               cols=next(iter(sumfacts)).matrix_sequence[i].cols,
-                                               transpose=next(iter(sumfacts)).matrix_sequence[i].transpose,
-                                               derivative=tuple(derivative),
-                                               face=next(iter(sumfacts)).matrix_sequence[i].face,
-                                               )
-            large_matrix_sequence.append(large)
+        # Store the kernels as tuple according to their positions
+        sorting = [None] * len(position_mapping)
+        for sf, pos in position_mapping.items():
+            sorting[pos] = sf
+        kernels = tuple(sorting)
+
+        buffer = get_counted_variable("joined_buffer")
+        input = get_counted_variable("joined_input")
 
         for sumf in sumfacts:
             _cache_vectorization_info(sumf,
-                                      sumf.copy(matrix_sequence=tuple(large_matrix_sequence),
-                                                buffer=buf,
-                                                input=inp,
-                                                index=position_mapping[sumf],
-                                                padding=frozenset(available),
-                                                insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts),
-                                                )
+                                      VectorizedSumfactKernel(kernels=kernels,
+                                                              vector_width=width,
+                                                              buffer=buffer,
+                                                              input=input,
+                                                              )
                                       )
     else:
         # Disable vectorization strategy
diff --git a/python/setup.py b/python/setup.py
index 00f6f75e8ec4070b2da6205ed89359e1a9a62fd2..cfc85288c50aa3af535d53ce2a5c22d7f898d13a 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -39,7 +39,7 @@ setup(name='dune.perftool',
                 'dune.perftool.ufl',
                 'dune.perftool.ufl.transformations',
                 ],
-      install_requires=['dune.testtools', 'sympy'],
+      install_requires=['dune.testtools', 'sympy', 'frozendict'],
       tests_require=['pytest'],
       cmdclass={'test': PyTest},
       entry_points = {