diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index a5e073cd08644ec7ca709c26577491657f2a7b8f..680b1ec48ac9f2bf77f96c92826dc71d9ef60a06 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -89,7 +89,7 @@ def c_instruction_impl(**kw):
 
 @generator_factory(item_tags=("instruction", "exprinstruction"),
                    context_tags="kernel",
-                   cache_key_generator=lambda *a, **kw: kw['expression'],
+                   cache_key_generator=lambda *a, **kw: (kw['assignee'], kw['expression']),
                    )
 def expr_instruction_impl(**kw):
     if 'assignees' in kw:
@@ -110,8 +110,11 @@ def _insn_cache_key(code=None, expression=None, **kwargs):
     if code is not None:
         return code
     if expression is not None:
-        return expression
-    raise ValueError("Please specify either code or expression for instruction!")
+        if 'assignees' in kwargs:
+            return (kwargs['assignees'], expression)
+        if 'assignee' in kwargs:
+            return (kwargs['assignee'], expression)
+    raise ValueError("Error caching instruction, missing information about assignee")
 
 
 @generator_factory(item_tags=("insn_id",),
@@ -132,6 +135,9 @@ def instruction(code=None, expression=None, **kwargs):
         if 'assignees' in kwargs and len(kwargs['assignees']) == 0:
             call_instruction_impl(id=id, expression=expression, **kwargs)
         else:
+            if 'assignees' in kwargs:
+                assert 'assignee' not in kwargs
+                kwargs['assignee'] = kwargs['assignees']
             expr_instruction_impl(id=id, expression=expression, **kwargs)
 
     # return the ID, as it is the only useful information to the user
@@ -167,15 +173,29 @@ def barrier(**kwargs):
     return name
 
 
-def loopy_class_member(name, classtag=None, **kwargs):
+def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwargs):
     """ A class member is based on loopy! It is an
     * temporary variable of the constructor kernel
     * A globalarg of the requesting kernel (to make things pass)
     """
     assert classtag
+    assert "base_storage" not in kwargs
+
+    if potentially_vectorized:
+        base = "{}_base".format(name)
+        kwargs["base_storage"] = base
+
+    # Check if this is vectorized, if so, change name to vec
+    dim_tags = kwargs.get("dim_tags", "")
+    if dim_tags and dim_tags.split(',')[-1] == "vec":
+        name = "{}_vec".format(name)
+
     temporary_variable(name, kernel=classtag, **kwargs)
     silenced_warning("read_no_write({})".format(name), kernel=classtag)
 
     kwargs.pop("decl_method", None)
+    kwargs.pop("base_storage", None)
     # TODO I guess some filtering has to be applied here.
     globalarg(name, **kwargs)
+
+    return name
diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py
index 4f5832669bd876c3601a1ffb90294f32b4efff6a..af0fc6973a96b9df8dd4e9db8cb2e5392e770249 100644
--- a/python/dune/perftool/loopy/buffer.py
+++ b/python/dune/perftool/loopy/buffer.py
@@ -38,13 +38,9 @@ class FlipFlopBuffer(object):
         formdata = get_global_context_value('formdata')
         dim = formdata.geometric_dimension
 
-        # TODO: doc!
-        storage_shape = (self.base_storage_size,) + (1,) * (dim - 1)
-
         # Construct the temporary and return it
         temporary_variable(name,
                            base_storage=base,
-                           storage_shape=storage_shape,
                            managed=True,
                            **kwargs
                            )
diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py
index 0c9d1f967521e770e21e9d77c67df663c1231165..23415aac89afd84a66a94d110f3d899682a9bf13 100644
--- a/python/dune/perftool/loopy/symbolic.py
+++ b/python/dune/perftool/loopy/symbolic.py
@@ -15,21 +15,37 @@ import pymbolic.primitives as prim
 
 
 class SumfactKernel(prim.Variable):
-    def __init__(self, a_matrices, buffer, insn_dep=frozenset({}), additional_inames=frozenset({})):
+    def __init__(self,
+                 a_matrices,
+                 buffer,
+                 stage=1,
+                 insn_dep=frozenset({}),
+                 additional_inames=frozenset({}),
+                 preferred_interleaving_position=0,
+                 setup_method=None,
+                 input_temporary=None,
+                 ):
         self.a_matrices = a_matrices
         self.buffer = buffer
+        self.stage = stage
         self.insn_dep = insn_dep
         self.additional_inames = additional_inames
+        self.preferred_interleaving_position = preferred_interleaving_position
+        self.setup_method = setup_method
+        self.input_temporary = input_temporary
+
+        if setup_method is not None:
+            assert isinstance(setup_method, tuple) and len(setup_method) == 2
 
         prim.Variable.__init__(self, "SUMFACT")
 
     def __getinitargs__(self):
-        return (self.a_matrices, self.buffer, self.insn_dep, self.additional_inames)
+        return (self.a_matrices, self.buffer, self.stage, self.insn_dep, self.additional_inames, self.preferred_interleaving_position, self.setup_method, self.input_temporary)
 
     def stringifier(self):
         return lp.symbolic.StringifyMapper
 
-    init_arg_names = ("a_matrices", "buffer", "insn_dep", "additional_inames")
+    init_arg_names = ("a_matrices", "buffer", "stage", "insn_dep", "additional_inames", "preferred_interleaving_position", "setup_method", "input_temporary")
 
     mapper_method = "map_sumfact_kernel"
 
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 16d45c59cb02ffb9aaa4a9f83bc02e364e2e78ab..120e5202732fb9884f65626da9e3a38481aaf1f7 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -43,7 +43,8 @@ def get_form_compiler_arguments():
     parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
     # TODO at some point this help description should be updated
     parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization")
-    parser.add_argument("--vectorize", action="store_true", help="whether to generate code with explicit vectorization")
+    parser.add_argument("--vectorize-quad", action="store_true", help="whether to generate code with explicit vectorization")
+    parser.add_argument("--vectorize-grads", action="store_true", help="whether to generate code with explicit vectorization")
 
     # Modify the positional argument to not be a list
     args = vars(parser.parse_args())
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index e5c3b0b4dc5011c087b3ea97ad6e67be556949da..c1455f35f71a8cb82103df1662870cc39b4992a3 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -29,6 +29,7 @@ from pymbolic.primitives import Variable
 from pytools import Record
 
 import loopy as lp
+import cgen
 
 
 def name_form(formdata, data):
@@ -531,7 +532,7 @@ def extract_kernel_from_cache(tag):
     kernel = heuristic_duplication(kernel)
 
     # Maybe apply vectorization strategies
-    if get_option("vectorize"):
+    if get_option("vectorize_quad"):
         if get_option("sumfact"):
             # Vectorization of the quadrature loop
             insns = [i.id for i in lp.find_instructions(kernel, lp.match.Tagged("quadvec"))]
@@ -668,7 +669,7 @@ def cgen_class_from_cache(tag, members=[]):
                                         allow_complex=False,
                                         is_generating_device_code=True,
                                         )
-    decls = target.get_device_ast_builder().get_temporary_decls(codegen_state, 0)
+    decls = [cgen.Line("\n  " + next(iter(d.generate()))) for d in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0)]
 
     from dune.perftool.cgen import Class
     return Class(basename, base_classes=base_classes, members=[constructor] + members + pm + decls, tparam_decls=tparams)
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index b039c4545cb6dd61e8a05843b9e45f8a8cefbc15..54303ecedeb42608d2054b9c91264ce3e973bbd7 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -28,23 +28,33 @@ from loopy import CallMangleInfo
 from loopy.symbolic import FunctionIdentifier
 from loopy.types import NumpyType
 
-from pytools import Record, product
+from pytools import ImmutableRecord, product
 
 import pymbolic.primitives as prim
 import numpy
 
 
-class AMatrix(Record):
+class AMatrix(ImmutableRecord):
     def __init__(self, rows, cols, transpose=False, derivative=False):
-        Record.__init__(self,
-                        rows=rows,
-                        cols=cols,
-                        transpose=transpose,
-                        derivative=derivative,
-                        )
-
-    def __hash__(self):
-        return hash((self.transpose, self.derivative, self.rows, self.cols))
+        ImmutableRecord.__init__(self,
+                                 rows=rows,
+                                 cols=cols,
+                                 transpose=transpose,
+                                 derivative=derivative,
+                                 )
+
+
+class LargeAMatrix(ImmutableRecord):
+    def __init__(self, rows, cols, transpose, derivative):
+        assert isinstance(transpose, tuple)
+        assert isinstance(derivative, tuple)
+        assert len(transpose) == len(derivative)
+        ImmutableRecord.__init__(self,
+                                 rows=rows,
+                                 cols=cols,
+                                 transpose=transpose,
+                                 derivative=derivative,
+                                 )
 
 
 def quadrature_points_per_direction():
@@ -171,17 +181,23 @@ def polynomial_lookup_mangler(target, func, dtypes):
         return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))
 
 
-def define_theta(name, shape, transpose, derivative):
+def define_theta(name, shape, transpose, derivative, additional_indices=()):
     sort_quadrature_points_weights()
     polynomials = name_polynomials()
     qp = name_oned_quadrature_points()
 
+    dim_tags = "f,f"
+    if additional_indices:
+        dim_tags = dim_tags + ",c"
+        shape = shape + (4,)
+
     loopy_class_member(name,
                        dtype=numpy.float64,
                        shape=shape,
                        classtag="operator",
-                       dim_tags="f,f",
+                       dim_tags=dim_tags,
                        managed=True,
+                       potentially_vectorized=True,
                        )
 
     # TODO Enforce the alignment here!
@@ -194,7 +210,7 @@ def define_theta(name, shape, transpose, derivative):
     else:
         args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),)))
 
-    instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))),
+    instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j)) + additional_indices),
                 expression=prim.Call(PolynomialLookup(polynomials, derivative), args),
                 kernel="operator",
                 )
@@ -206,6 +222,25 @@ def name_theta(transpose=False, derivative=False):
         shape = (basis_functions_per_direction(), quadrature_points_per_direction())
     else:
         shape = (quadrature_points_per_direction(), basis_functions_per_direction())
-    globalarg(name, shape=shape, dtype=numpy.float64, dim_tags="f,f")
     define_theta(name, shape, transpose, derivative)
     return name
+
+
+def name_large_theta(transpose=False, derivative=False):
+    ident = tuple("{}{}".format("d" if d else "", "T" if t else "") for t, d in zip(transpose, derivative))
+    name = "ThetaLarge_{}".format("_".join(ident))
+    if True in transpose:
+        shape = (basis_functions_per_direction(), quadrature_points_per_direction())
+    else:
+        shape = (quadrature_points_per_direction(), basis_functions_per_direction())
+
+    for i, (t, d) in enumerate(zip(transpose, derivative)):
+        define_theta(name, shape, t, d, additional_indices=(i,))
+
+    return loopy_class_member(name,
+                              classtag="operator",
+                              dtype=numpy.float64,
+                              dim_tags="f,f,vec",
+                              shape=shape + (4,),
+                              potentially_vectorized=True,
+                              )
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 96f24219dc7244474d6c370728723f7280830eed..390acb59e5a7f4ba7ec2b724b59eb94f93848b5e 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -75,10 +75,10 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
         # Add a sum factorization kernel that implements the
         # evaluation of the gradients of basis functions at quadrature
         # points (stage 1)
-        insn_dep = setup_theta(element, restriction, component, a_matrices, buffer_name)
         var = SumfactKernel(a_matrices,
                             buffer_name,
-                            insn_dep=frozenset({insn_dep}),
+                            preferred_interleaving_position=i,
+                            setup_method=(setup_theta, (element, restriction, component))
                             )
 
         buffers.append(var)
@@ -100,11 +100,8 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
 def pymbolic_trialfunction_gradient(element, restriction, component):
     rawname = "gradu" + "_".join(str(c) for c in component)
     name = restricted_name(rawname, restriction)
-    from dune.perftool.pdelab.argument import name_coefficientcontainer
-    container = name_coefficientcontainer(restriction)
     sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
-    from pymbolic.primitives import Variable
-    return Variable(name)
+    return prim.Variable(name)
 
 
 @kernel_cached
@@ -128,10 +125,10 @@ def pymbolic_trialfunction(element, restriction, component):
 
     # Add a sum factorization kernel that implements the evaluation of
     # the basis functions at quadrature points (stage 1)
-    insn_dep = setup_theta(element, restriction, component, a_matrices, buffer_name)
     var = SumfactKernel(a_matrices,
                         buffer_name,
-                        insn_dep=frozenset({insn_dep}),
+                        preferred_interleaving_position=dim,
+                        setup_method=(setup_theta, (element, restriction, component))
                         )
 
     return prim.Subscript(var,
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 35cb783e2e325efbe89796ccfae89d550300f1b6..e3042690f55007928716e0baeb806fa4e0ba9030 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -29,13 +29,18 @@ from dune.perftool.loopy.buffer import (get_buffer_temporary,
 from dune.perftool.sumfact.quadrature import nest_quadrature_loops
 from dune.perftool.pdelab.localoperator import determine_accumulation_space
 from dune.perftool.pdelab.restriction import restricted_name
-from dune.perftool.pdelab.spaces import name_lfs
+from dune.perftool.pdelab.spaces import (name_lfs,
+                                         name_lfs_bound,
+                                         )
 from dune.perftool.sumfact.amatrix import (AMatrix,
+                                           LargeAMatrix,
                                            quadrature_points_per_direction,
                                            basis_functions_per_direction,
+                                           name_large_theta,
                                            name_theta,
                                            )
 from dune.perftool.loopy.symbolic import SumfactKernel
+from dune.perftool.tools import get_pymbolic_basename
 from dune.perftool.error import PerftoolError
 from pymbolic.primitives import (Call,
                                  Product,
@@ -44,9 +49,10 @@ from pymbolic.primitives import (Call,
                                  )
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from loopy import Reduction, GlobalArg
-from loopy.symbolic import FunctionIdentifier
+from loopy.symbolic import FunctionIdentifier, IdentityMapper
 
 import loopy as lp
+import numpy as np
 import pymbolic.primitives as prim
 from pytools import product
 
@@ -72,13 +78,59 @@ def find_sumfact(expr):
     return HasSumfactMapper()(expr)
 
 
-def expand_sumfact_kernels(insns):
+class IndexFiddleMapper(IdentityMapper):
+    def __init__(self, var, index, pos):
+        assert isinstance(var, str)
+        self.var = var
+        self.index = index
+        if isinstance(index, str):
+            self.index = prim.Variable(index)
+        self.pos = pos
+
+    def map_variable(self, expr):
+        if expr.name == self.var:
+            return prim.Subscript(expr, (self.var,))
+        else:
+            return IdentityMapper.map_variable(self, expr)
+
+    def map_subscript(self, expr):
+        if expr.aggregate.name == self.var:
+            ind = expr.index
+            if not isinstance(ind, tuple):
+                ind = (ind,)
+            if self.pos is None:
+                ind = ind + (self.index,)
+            else:
+                raise NotImplementedError
+            return prim.Subscript(expr.aggregate, ind)
+        else:
+            return IdentityMapper.map_subscript(self, expr)
+
+
+def fiddle_in_index(expr, var, index, pos=None):
+    return IndexFiddleMapper(var, index, pos)(expr)
+
+
+def default_resolution(insns):
     for insn in insns:
         if isinstance(insn, (lp.Assignment, lp.CallInstruction)):
             replace = {}
             deps = []
             for sumf in find_sumfact(insn.expression):
-                var, dep = sum_factorization_kernel(sumf.a_matrices, sumf.buffer, sumf.insn_dep, sumf.additional_inames)
+                # Maybe set up the input
+                if sumf.setup_method:
+                    shape = product(mat.cols for mat in sumf.a_matrices)
+                    shape = (shape,)
+                    inp = get_buffer_temporary(sumf.buffer, shape=shape)
+                    silenced_warning('read_no_write({})'.format(inp))
+
+                    func, args = sumf.setup_method
+                    insn_dep = frozenset({func(inp, *args)})
+                else:
+                    insn_dep = sumf.insn_dep
+
+                # Call the sum factorization algorithm
+                var, dep = sum_factorization_kernel(sumf.a_matrices, sumf.buffer, insn_dep, sumf.additional_inames)
                 replace[sumf] = prim.Variable(var)
                 deps.append(dep)
 
@@ -89,6 +141,98 @@ def expand_sumfact_kernels(insns):
                                   )
 
 
+def apply_sumfact_grad_vectorization(insns, stage):
+    sumfact_kernels = []
+    for insn in insns:
+        if isinstance(insn, (lp.Assignment, lp.CallInstruction)):
+            sumf = find_sumfact(insn.expression)
+            if sumf:
+                sumf, = sumf
+                if sumf.stage == stage:
+                    sumfact_kernels.append(sumf)
+
+    # Now apply some heuristics when to vectorize...
+    if len(set(sumfact_kernels)) < 3:
+        default_resolution(insns)
+    else:
+        # Vectorize!!!
+        sumfact_kernels = sorted(sumfact_kernels, key=lambda s: s.preferred_interleaving_position)
+        # Pad to 4
+        if len(sumfact_kernels) < 4:
+            sumfact_kernels.append(sumfact_kernels[0])
+
+        # Determine a name for the buffer in use
+        buffer = "_".join(sf.buffer for sf in sumfact_kernels)
+        insn_dep = frozenset()
+
+        # Maybe initialize the input buffer
+        if sumfact_kernels[0].setup_method:
+            shape = product(mat.cols for mat in sumfact_kernels[0].a_matrices)
+            shape = (shape, 4)
+            initialize_buffer(buffer, base_storage_size=4 * shape[0])
+            inp = get_buffer_temporary(buffer, shape=shape)
+            silenced_warning('read_no_write({})'.format(inp))
+
+            # Call the individual setup_methods
+            for i, sumf in enumerate(sumfact_kernels):
+                func, args = sumf.setup_method
+                insn_dep = insn_dep.union({func(inp, *args, additional_indices=(i,))})
+        else:
+            # No setup method defined. We need to make sure the input is correctly setup
+            shape = tuple(mat.cols for mat in sumfact_kernels[0].a_matrices) + (4,)
+            initialize_buffer(buffer, base_storage_size=4 * shape[0])
+            dim_tags = ",".join("f" * (len(shape) - 1)) + ",c"
+            inp = get_buffer_temporary(buffer, shape=shape, dim_tags=dim_tags)
+
+            for i, sumf in enumerate(sumfact_kernels):
+                assert sumf.input_temporary
+                for insn in insns:
+                    if isinstance(insn, lp.Assignment):
+                        if get_pymbolic_basename(insn.assignee) == sumf.input_temporary:
+                            built_instruction(insn.copy(assignee=prim.Subscript(prim.Variable(inp), insn.assignee.index + (i,))))
+                            from dune.perftool.generation.loopy import expr_instruction_impl
+                            expr_instruction_impl._memoize_cache = {k: v for k, v in expr_instruction_impl._memoize_cache.items() if v.id != insn.id}
+
+        # Determine the joined AMatrix
+        large_a_matrices = []
+        for i in range(len(sumfact_kernels[0].a_matrices)):
+            assert len(set(tuple(sf.a_matrices[i].rows for sf in sumfact_kernels))) == 1
+            assert len(set(tuple(sf.a_matrices[i].cols for sf in sumfact_kernels))) == 1
+            large = LargeAMatrix(rows=sumfact_kernels[0].a_matrices[i].rows,
+                                 cols=sumfact_kernels[0].a_matrices[i].cols,
+                                 transpose=tuple(sf.a_matrices[i].transpose for sf in sumfact_kernels),
+                                 derivative=tuple(sf.a_matrices[i].derivative for sf in sumfact_kernels),
+                                 )
+            large_a_matrices.append(large)
+
+        # Join the instruction dependencies
+        insn_dep = insn_dep.union(*tuple(sf.insn_dep for sf in sumfact_kernels))
+
+        var, dep = sum_factorization_kernel(large_a_matrices, buffer, insn_dep, sumfact_kernels[0].additional_inames, add_vec_tag=True)
+
+        for insn in insns:
+            if isinstance(insn, (lp.Assignment, lp.CallInstruction)):
+                sumfacts = find_sumfact(insn.expression)
+                if sumfacts:
+                    sumf, = sumfacts
+                    if sumf.stage == stage:
+                        replace = {}
+                        replace[sumf] = prim.Variable(var)
+                        newexpr = substitute(insn.expression, replace)
+                        newexpr = fiddle_in_index(newexpr, var, sumf.preferred_interleaving_position)
+                        assert newexpr
+                        built_instruction(insn.copy(expression=newexpr,
+                                                    depends_on=dep,
+                                                    ))
+
+def expand_sumfact_kernels(insns):
+    if get_option("vectorize_grads"):
+        apply_sumfact_grad_vectorization(insns, 1)
+        apply_sumfact_grad_vectorization(insns, 3)
+    else:
+        default_resolution(insns)
+
+
 def filter_sumfact_instructions():
     """ Remove all instructions that contain a SumfactKernel node """
     from dune.perftool.generation.loopy import expr_instruction_impl, call_instruction_impl
@@ -109,19 +253,13 @@ def sumfact_iname(bound, _type):
     return name
 
 
-def setup_theta(element, restriction, component, a_matrices, buffer_name):
-    number_basis = product(mat.cols for mat in a_matrices)
-    shape = (number_basis,)
-    inp = get_buffer_temporary(buffer_name,
-                               shape=shape)
-    silenced_warning('read_no_write({})'.format(inp))
-
+def setup_theta(inp, element, restriction, component, additional_indices=()):
     # Write initial coefficients into buffer
-    basisiname = sumfact_iname(number_basis, "basis")
     lfs = name_lfs(element, restriction, component)
+    basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
     container = name_coefficientcontainer(restriction)
     coeff = pymbolic_coefficient(container, lfs, basisiname)
-    assignee = Subscript(Variable(inp), (Variable(basisiname),))
+    assignee = Subscript(Variable(inp), (Variable(basisiname),) + additional_indices)
     return instruction(assignee=assignee,
                        expression=coeff,
                        )
@@ -178,9 +316,10 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
             a_matrices = [theta_matrix] * dim
             a_matrices[i] = dtheta_matrix
             a_matrices = tuple(a_matrices)
-
+            pref_pos = i
         else:
             a_matrices = (theta_matrix,) * dim
+            pref_pos = dim
 
         # Initialize a base storage for this buffer and get a temporay pointing to it
         temp = initialize_buffer(buf,
@@ -219,6 +358,9 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                buf,
                                insn_dep=frozenset({contrib_dep}),
                                additional_inames=frozenset(visitor.inames),
+                               stage=3,
+                               preferred_interleaving_position=pref_pos,
+                               input_temporary=temp,
                                )
 
         inames = tuple(sumfact_iname(mat.rows, 'accum') for mat in a_matrices)
@@ -258,7 +400,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         transform(nest_quadrature_loops, visitor.inames)
 
 
-def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional_inames=frozenset({})):
+def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional_inames=frozenset({}), add_vec_tag=False):
     """
     Calculate a sum factorization matrix product.
 
@@ -278,6 +420,10 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
         should depend upon. All following ones will depend on each
         other.
     """
+    ftags = "f,f" + (",vec" if add_vec_tag else "")
+    ctags = "c,c" + (",vec" if add_vec_tag else "")
+    vec_shape = (4,) if add_vec_tag else ()
+
     insn_dep = frozenset({barrier(depends_on=insn_dep,
                                   within_inames=additional_inames,
                                   )})
@@ -288,8 +434,8 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
         # this reinterprets the output of the previous iteration.
         inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
         inp = get_buffer_temporary(buf,
-                                   shape=inp_shape,
-                                   dim_tags="f,f")
+                                   shape=inp_shape + vec_shape,
+                                   dim_tags=ftags)
 
         # The input temporary will only be read from, so we need to silence the loopy warning
         silenced_warning('read_no_write({})'.format(inp))
@@ -301,22 +447,34 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
         out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
 
         out = get_buffer_temporary(buf,
-                                   shape=out_shape,
-                                   dim_tags="c,c")
+                                   shape=out_shape + vec_shape,
+                                   dim_tags=ctags)
 
         # Get the inames needed for one matrix-matrix multiplication
         i = sumfact_iname(out_shape[0], "row")
         j = sumfact_iname(out_shape[1], "col")
         k = sumfact_iname(a_matrix.cols, "red")
 
+        # Select the correct Theta Matrix for multiplication
+        if isinstance(a_matrix, AMatrix):
+            theta = name_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative)
+            vec_iname = ()
+        elif isinstance(a_matrix, LargeAMatrix):
+            theta = name_large_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative)
+            iname = sumfact_iname(4, "vec")
+            vec_iname = (prim.Variable(iname),)
+            transform(lp.tag_inames, [(iname, "vec")])
+        else:
+            raise TypeError("What kind of an AMatrix is that?")
+
         # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-        prod = Product((Subscript(Variable(name_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative)), (Variable(i), Variable(k))),
-                        Subscript(Variable(inp), (Variable(k), Variable(j)))
+        prod = Product((Subscript(Variable(theta), (Variable(i), Variable(k)) + vec_iname),
+                        Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
                         ))
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on
-        insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j))),
+        insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname),
                                           expression=Reduction("sum", k, prod),
                                           forced_iname_deps=frozenset({i, j}).union(additional_inames),
                                           forced_iname_deps_is_final=True,
@@ -330,6 +488,11 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
 
     out_shape = tuple(mat.rows for mat in a_matrices)
     dim_tags = ",".join(['f'] * dim)
+
+    if add_vec_tag:
+        out_shape = out_shape + vec_shape
+        dim_tags = dim_tags + ",c"
+
     out = get_buffer_temporary(buf,
                                shape=out_shape,
                                dim_tags=dim_tags)
diff --git a/python/pytools b/python/pytools
index 6c5830f6e007f4ece99d75c83b0f72550cad80c2..7ed09d95be2bd2fb22392010cf28830fd43a30ca 160000
--- a/python/pytools
+++ b/python/pytools
@@ -1 +1 @@
-Subproject commit 6c5830f6e007f4ece99d75c83b0f72550cad80c2
+Subproject commit 7ed09d95be2bd2fb22392010cf28830fd43a30ca
diff --git a/test/sumfact/mass/mass.mini b/test/sumfact/mass/mass.mini
index c8729beade5e5f75d2f45619b3e33e5f8776292d..80594037a20b03ef2465080d6d7cdc62062b7c0e 100644
--- a/test/sumfact/mass/mass.mini
+++ b/test/sumfact/mass/mass.mini
@@ -13,5 +13,5 @@ extension = vtu
 
 [formcompiler]
 numerical_jacobian = 1, 0 | expand num
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
 sumfact = 1
diff --git a/test/sumfact/mass/mass_3d.mini b/test/sumfact/mass/mass_3d.mini
index 17da3e4f4747889c3538b2de8d3afaf96469345f..2948674e0acc6b34dd4e92fec728f449e38978c6 100644
--- a/test/sumfact/mass/mass_3d.mini
+++ b/test/sumfact/mass/mass_3d.mini
@@ -15,5 +15,5 @@ extension = vtu
 
 [formcompiler]
 numerical_jacobian = 1, 0 | expand num
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
 sumfact = 1
diff --git a/test/sumfact/poisson/poisson_2d_order1.mini b/test/sumfact/poisson/poisson_2d_order1.mini
index 339ae2c9b837bc78aace34be79fe03d54d82ab80..85e91b2135c564ac9ab743fec5502040b2c34a65 100644
--- a/test/sumfact/poisson/poisson_2d_order1.mini
+++ b/test/sumfact/poisson/poisson_2d_order1.mini
@@ -17,4 +17,4 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-4
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
diff --git a/test/sumfact/poisson/poisson_2d_order2.mini b/test/sumfact/poisson/poisson_2d_order2.mini
index 8bdf424c46a66ae0a0e2b1a1da257eede4975789..5cdd15ff4bf39a8839ec1b1aa74f623ff4ad5585 100644
--- a/test/sumfact/poisson/poisson_2d_order2.mini
+++ b/test/sumfact/poisson/poisson_2d_order2.mini
@@ -17,4 +17,4 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-8
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
diff --git a/test/sumfact/poisson/poisson_3d_order1.mini b/test/sumfact/poisson/poisson_3d_order1.mini
index 6250546eafa4527491931539547a6e961003dc8a..a636bda7fe69f91ebfc99154ecedf2b90ab89d07 100644
--- a/test/sumfact/poisson/poisson_3d_order1.mini
+++ b/test/sumfact/poisson/poisson_3d_order1.mini
@@ -1,8 +1,10 @@
 __name = sumfact_poisson_3d_order1_{__exec_suffix}
-__exec_suffix = {diff_suffix}_{vec_suffix}
+__exec_suffix = {diff_suffix}_{vecq_suffix}_{vecg_suffix}
 
 diff_suffix = numdiff, symdiff | expand num
-vec_suffix = vec, nonvec | expand vec
+vecq_suffix = quadvec, nonquadvec | expand vec_q
+vecg_suffix = gradvec, nongradvec | expand vec_g
+{vecq_suffix} == quadvec and {vecg_suffix} == gradvec | exclude
 
 cells = 8 8 8
 extension = 1. 1. 1.
@@ -17,4 +19,5 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-4
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec_q
+vectorize_grads = 1, 0 | expand vec_g
diff --git a/test/sumfact/poisson/poisson_3d_order2.mini b/test/sumfact/poisson/poisson_3d_order2.mini
index 54c0e3ccd6723c995dc3c396cdf08e5c06f5b561..24ec010e7beef46dea74bffd318411118a2c727d 100644
--- a/test/sumfact/poisson/poisson_3d_order2.mini
+++ b/test/sumfact/poisson/poisson_3d_order2.mini
@@ -1,8 +1,10 @@
 __name = sumfact_poisson_3d_order2_{__exec_suffix}
-__exec_suffix = {diff_suffix}_{vec_suffix}
+__exec_suffix = {diff_suffix}_{vecq_suffix}_{vecg_suffix}
 
 diff_suffix = numdiff, symdiff | expand num
-vec_suffix = vec, nonvec | expand vec
+vecq_suffix = quadvec, nonquadvec | expand vec_q
+vecg_suffix = gradvec, nongradvec | expand vec_g
+{vecq_suffix} == quadvec and {vecg_suffix} == gradvec | exclude
 
 cells = 8 8 8
 extension = 1. 1. 1.
@@ -17,4 +19,5 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-8
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec_q
+vectorize_grads = 1, 0 | expand vec_g