Implement vectorization through simultaneous treatment of function/gradients

fc5d941e · Dominic Kempf · ecfbe719 · fc5d941e · fc5d941e · fc5d941e
Commit fc5d941e authored 8 years ago by Dominic Kempf
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -89,7 +89,7 @@ def c_instruction_impl(**kw):

 @generator_factory(item_tags=("instruction", "exprinstruction"),
                   context_tags="kernel",
-                   cache_key_generator=lambda *a, **kw: kw['expression'],
+                   cache_key_generator=lambda *a, **kw: (kw['assignee'], kw['expression']),
                   )
 def expr_instruction_impl(**kw):
    if 'assignees' in kw:
@@ -110,8 +110,11 @@ def _insn_cache_key(code=None, expression=None, **kwargs):
    if code is not None:
        return code
    if expression is not None:
-        return expression
-    raise ValueError("Please specify either code or expression for instruction!")
+        if 'assignees' in kwargs:
+            return (kwargs['assignees'], expression)
+        if 'assignee' in kwargs:
+            return (kwargs['assignee'], expression)
+    raise ValueError("Error caching instruction, missing information about assignee")


 @generator_factory(item_tags=("insn_id",),
@@ -132,6 +135,9 @@ def instruction(code=None, expression=None, **kwargs):
        if 'assignees' in kwargs and len(kwargs['assignees']) == 0:
            call_instruction_impl(id=id, expression=expression, **kwargs)
        else:
+            if 'assignees' in kwargs:
+                assert 'assignee' not in kwargs
+                kwargs['assignee'] = kwargs['assignees']
            expr_instruction_impl(id=id, expression=expression, **kwargs)

    # return the ID, as it is the only useful information to the user
@@ -167,15 +173,29 @@ def barrier(**kwargs):
    return name


-def loopy_class_member(name, classtag=None, **kwargs):
+def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwargs):
    """ A class member is based on loopy! It is an
    * temporary variable of the constructor kernel
    * A globalarg of the requesting kernel (to make things pass)
    """
    assert classtag
+    assert "base_storage" not in kwargs
+
+    if potentially_vectorized:
+        base = "{}_base".format(name)
+        kwargs["base_storage"] = base
+
+    # Check if this is vectorized, if so, change name to vec
+    dim_tags = kwargs.get("dim_tags", "")
+    if dim_tags and dim_tags.split(',')[-1] == "vec":
+        name = "{}_vec".format(name)
+
    temporary_variable(name, kernel=classtag, **kwargs)
    silenced_warning("read_no_write({})".format(name), kernel=classtag)

    kwargs.pop("decl_method", None)
+    kwargs.pop("base_storage", None)
    # TODO I guess some filtering has to be applied here.
    globalarg(name, **kwargs)
+
+    return name
--- a/python/dune/perftool/loopy/buffer.py
+++ b/python/dune/perftool/loopy/buffer.py
@@ -38,13 +38,9 @@ class FlipFlopBuffer(object):
        formdata = get_global_context_value('formdata')
        dim = formdata.geometric_dimension

-        # TODO: doc!
-        storage_shape = (self.base_storage_size,) + (1,) * (dim - 1)
-
        # Construct the temporary and return it
        temporary_variable(name,
                           base_storage=base,
-                           storage_shape=storage_shape,
                           managed=True,
                           **kwargs
                           )

--- a/python/dune/perftool/loopy/symbolic.py
+++ b/python/dune/perftool/loopy/symbolic.py
@@ -15,21 +15,37 @@ import pymbolic.primitives as prim


 class SumfactKernel(prim.Variable):
-    def __init__(self, a_matrices, buffer, insn_dep=frozenset({}), additional_inames=frozenset({})):
+    def __init__(self,
+                 a_matrices,
+                 buffer,
+                 stage=1,
+                 insn_dep=frozenset({}),
+                 additional_inames=frozenset({}),
+                 preferred_interleaving_position=0,
+                 setup_method=None,
+                 input_temporary=None,
+                 ):
        self.a_matrices = a_matrices
        self.buffer = buffer
+        self.stage = stage
        self.insn_dep = insn_dep
        self.additional_inames = additional_inames
+        self.preferred_interleaving_position = preferred_interleaving_position
+        self.setup_method = setup_method
+        self.input_temporary = input_temporary
+
+        if setup_method is not None:
+            assert isinstance(setup_method, tuple) and len(setup_method) == 2

        prim.Variable.__init__(self, "SUMFACT")

    def __getinitargs__(self):
-        return (self.a_matrices, self.buffer, self.insn_dep, self.additional_inames)
+        return (self.a_matrices, self.buffer, self.stage, self.insn_dep, self.additional_inames, self.preferred_interleaving_position, self.setup_method, self.input_temporary)

    def stringifier(self):
        return lp.symbolic.StringifyMapper

-    init_arg_names = ("a_matrices", "buffer", "insn_dep", "additional_inames")
+    init_arg_names = ("a_matrices", "buffer", "stage", "insn_dep", "additional_inames", "preferred_interleaving_position", "setup_method", "input_temporary")

    mapper_method = "map_sumfact_kernel"


--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -43,7 +43,8 @@ def get_form_compiler_arguments():
    parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
    # TODO at some point this help description should be updated
    parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization")
-    parser.add_argument("--vectorize", action="store_true", help="whether to generate code with explicit vectorization")
+    parser.add_argument("--vectorize-quad", action="store_true", help="whether to generate code with explicit vectorization")
+    parser.add_argument("--vectorize-grads", action="store_true", help="whether to generate code with explicit vectorization")

    # Modify the positional argument to not be a list
    args = vars(parser.parse_args())

--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -29,6 +29,7 @@ from pymbolic.primitives import Variable
 from pytools import Record

 import loopy as lp
+import cgen


 def name_form(formdata, data):
@@ -531,7 +532,7 @@ def extract_kernel_from_cache(tag):
    kernel = heuristic_duplication(kernel)

    # Maybe apply vectorization strategies
-    if get_option("vectorize"):
+    if get_option("vectorize_quad"):
        if get_option("sumfact"):
            # Vectorization of the quadrature loop
            insns = [i.id for i in lp.find_instructions(kernel, lp.match.Tagged("quadvec"))]
@@ -668,7 +669,7 @@ def cgen_class_from_cache(tag, members=[]):
                                        allow_complex=False,
                                        is_generating_device_code=True,
                                        )
-    decls = target.get_device_ast_builder().get_temporary_decls(codegen_state, 0)
+    decls = [cgen.Line("\n  " + next(iter(d.generate()))) for d in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0)]

    from dune.perftool.cgen import Class
    return Class(basename, base_classes=base_classes, members=[constructor] + members + pm + decls, tparam_decls=tparams)

--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -28,23 +28,33 @@ from loopy import CallMangleInfo
 from loopy.symbolic import FunctionIdentifier
 from loopy.types import NumpyType

-from pytools import Record, product
+from pytools import ImmutableRecord, product

 import pymbolic.primitives as prim
 import numpy


-class AMatrix(Record):
+class AMatrix(ImmutableRecord):
    def __init__(self, rows, cols, transpose=False, derivative=False):
-        Record.__init__(self,
-                        rows=rows,
-                        cols=cols,
-                        transpose=transpose,
-                        derivative=derivative,
-                        )
-
-    def __hash__(self):
-        return hash((self.transpose, self.derivative, self.rows, self.cols))
+        ImmutableRecord.__init__(self,
+                                 rows=rows,
+                                 cols=cols,
+                                 transpose=transpose,
+                                 derivative=derivative,
+                                 )
+
+
+class LargeAMatrix(ImmutableRecord):
+    def __init__(self, rows, cols, transpose, derivative):
+        assert isinstance(transpose, tuple)
+        assert isinstance(derivative, tuple)
+        assert len(transpose) == len(derivative)
+        ImmutableRecord.__init__(self,
+                                 rows=rows,
+                                 cols=cols,
+                                 transpose=transpose,
+                                 derivative=derivative,
+                                 )


 def quadrature_points_per_direction():
@@ -171,17 +181,23 @@ def polynomial_lookup_mangler(target, func, dtypes):
        return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))


-def define_theta(name, shape, transpose, derivative):
+def define_theta(name, shape, transpose, derivative, additional_indices=()):
    sort_quadrature_points_weights()
    polynomials = name_polynomials()
    qp = name_oned_quadrature_points()

+    dim_tags = "f,f"
+    if additional_indices:
+        dim_tags = dim_tags + ",c"
+        shape = shape + (4,)
+
    loopy_class_member(name,
                       dtype=numpy.float64,
                       shape=shape,
                       classtag="operator",
-                       dim_tags="f,f",
+                       dim_tags=dim_tags,
                       managed=True,
+                       potentially_vectorized=True,
                       )

    # TODO Enforce the alignment here!
@@ -194,7 +210,7 @@ def define_theta(name, shape, transpose, derivative):
    else:
        args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),)))

-    instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))),
+    instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j)) + additional_indices),
                expression=prim.Call(PolynomialLookup(polynomials, derivative), args),
                kernel="operator",
                )
@@ -206,6 +222,25 @@ def name_theta(transpose=False, derivative=False):
        shape = (basis_functions_per_direction(), quadrature_points_per_direction())
    else:
        shape = (quadrature_points_per_direction(), basis_functions_per_direction())
-    globalarg(name, shape=shape, dtype=numpy.float64, dim_tags="f,f")
    define_theta(name, shape, transpose, derivative)
    return name
+
+
+def name_large_theta(transpose=False, derivative=False):
+    ident = tuple("{}{}".format("d" if d else "", "T" if t else "") for t, d in zip(transpose, derivative))
+    name = "ThetaLarge_{}".format("_".join(ident))
+    if True in transpose:
+        shape = (basis_functions_per_direction(), quadrature_points_per_direction())
+    else:
+        shape = (quadrature_points_per_direction(), basis_functions_per_direction())
+
+    for i, (t, d) in enumerate(zip(transpose, derivative)):
+        define_theta(name, shape, t, d, additional_indices=(i,))
+
+    return loopy_class_member(name,
+                              classtag="operator",
+                              dtype=numpy.float64,
+                              dim_tags="f,f,vec",
+                              shape=shape + (4,),
+                              potentially_vectorized=True,
+                              )
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -75,10 +75,10 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
        # Add a sum factorization kernel that implements the
        # evaluation of the gradients of basis functions at quadrature
        # points (stage 1)
-        insn_dep = setup_theta(element, restriction, component, a_matrices, buffer_name)
        var = SumfactKernel(a_matrices,
                            buffer_name,
-                            insn_dep=frozenset({insn_dep}),
+                            preferred_interleaving_position=i,
+                            setup_method=(setup_theta, (element, restriction, component))
                            )

        buffers.append(var)
@@ -100,11 +100,8 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
 def pymbolic_trialfunction_gradient(element, restriction, component):
    rawname = "gradu" + "_".join(str(c) for c in component)
    name = restricted_name(rawname, restriction)
-    from dune.perftool.pdelab.argument import name_coefficientcontainer
-    container = name_coefficientcontainer(restriction)
    sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
-    from pymbolic.primitives import Variable
-    return Variable(name)
+    return prim.Variable(name)


 @kernel_cached
@@ -128,10 +125,10 @@ def pymbolic_trialfunction(element, restriction, component):

    # Add a sum factorization kernel that implements the evaluation of
    # the basis functions at quadrature points (stage 1)
-    insn_dep = setup_theta(element, restriction, component, a_matrices, buffer_name)
    var = SumfactKernel(a_matrices,
                        buffer_name,
-                        insn_dep=frozenset({insn_dep}),
+                        preferred_interleaving_position=dim,
+                        setup_method=(setup_theta, (element, restriction, component))
                        )

    return prim.Subscript(var,

--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -29,13 +29,18 @@ from dune.perftool.loopy.buffer import (get_buffer_temporary,
 from dune.perftool.sumfact.quadrature import nest_quadrature_loops
 from dune.perftool.pdelab.localoperator import determine_accumulation_space
 from dune.perftool.pdelab.restriction import restricted_name
-from dune.perftool.pdelab.spaces import name_lfs
+from dune.perftool.pdelab.spaces import (name_lfs,
+                                         name_lfs_bound,
+                                         )
 from dune.perftool.sumfact.amatrix import (AMatrix,
+                                           LargeAMatrix,
                                           quadrature_points_per_direction,
                                           basis_functions_per_direction,
+                                           name_large_theta,
                                           name_theta,
                                           )
 from dune.perftool.loopy.symbolic import SumfactKernel
+from dune.perftool.tools import get_pymbolic_basename
 from dune.perftool.error import PerftoolError
 from pymbolic.primitives import (Call,
                                 Product,
@@ -44,9 +49,10 @@ from pymbolic.primitives import (Call,
                                 )
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from loopy import Reduction, GlobalArg
-from loopy.symbolic import FunctionIdentifier
+from loopy.symbolic import FunctionIdentifier, IdentityMapper

 import loopy as lp
+import numpy as np
 import pymbolic.primitives as prim
 from pytools import product

@@ -72,13 +78,59 @@ def find_sumfact(expr):
    return HasSumfactMapper()(expr)


-def expand_sumfact_kernels(insns):
+class IndexFiddleMapper(IdentityMapper):
+    def __init__(self, var, index, pos):
+        assert isinstance(var, str)
+        self.var = var
+        self.index = index
+        if isinstance(index, str):
+            self.index = prim.Variable(index)
+        self.pos = pos
+
+    def map_variable(self, expr):
+        if expr.name == self.var:
+            return prim.Subscript(expr, (self.var,))
+        else:
+            return IdentityMapper.map_variable(self, expr)
+
+    def map_subscript(self, expr):
+        if expr.aggregate.name == self.var:
+            ind = expr.index
+            if not isinstance(ind, tuple):
+                ind = (ind,)
+            if self.pos is None:
+                ind = ind + (self.index,)
+            else:
+                raise NotImplementedError
+            return prim.Subscript(expr.aggregate, ind)
+        else:
+            return IdentityMapper.map_subscript(self, expr)
+
+
+def fiddle_in_index(expr, var, index, pos=None):
+    return IndexFiddleMapper(var, index, pos)(expr)
+
+
+def default_resolution(insns):
    for insn in insns:
        if isinstance(insn, (lp.Assignment, lp.CallInstruction)):
            replace = {}
            deps = []
            for sumf in find_sumfact(insn.expression):
-                var, dep = sum_factorization_kernel(sumf.a_matrices, sumf.buffer, sumf.insn_dep, sumf.additional_inames)
+                # Maybe set up the input
+                if sumf.setup_method:
+                    shape = product(mat.cols for mat in sumf.a_matrices)
+                    shape = (shape,)
+                    inp = get_buffer_temporary(sumf.buffer, shape=shape)
+                    silenced_warning('read_no_write({})'.format(inp))
+
+                    func, args = sumf.setup_method
+                    insn_dep = frozenset({func(inp, *args)})
+                else:
+                    insn_dep = sumf.insn_dep
+
+                # Call the sum factorization algorithm
+                var, dep = sum_factorization_kernel(sumf.a_matrices, sumf.buffer, insn_dep, sumf.additional_inames)
                replace[sumf] = prim.Variable(var)
                deps.append(dep)

@@ -89,6 +141,98 @@ def expand_sumfact_kernels(insns):
                                  )


+def apply_sumfact_grad_vectorization(insns, stage):
+    sumfact_kernels = []
+    for insn in insns:
+        if isinstance(insn, (lp.Assignment, lp.CallInstruction)):
+            sumf = find_sumfact(insn.expression)
+            if sumf:
+                sumf, = sumf
+                if sumf.stage == stage:
+                    sumfact_kernels.append(sumf)
+
+    # Now apply some heuristics when to vectorize...
+    if len(set(sumfact_kernels)) < 3:
+        default_resolution(insns)
+    else:
+        # Vectorize!!!
+        sumfact_kernels = sorted(sumfact_kernels, key=lambda s: s.preferred_interleaving_position)
+        # Pad to 4
+        if len(sumfact_kernels) < 4:
+            sumfact_kernels.append(sumfact_kernels[0])
+
+        # Determine a name for the buffer in use
+        buffer = "_".join(sf.buffer for sf in sumfact_kernels)
+        insn_dep = frozenset()
+
+        # Maybe initialize the input buffer
+        if sumfact_kernels[0].setup_method:
+            shape = product(mat.cols for mat in sumfact_kernels[0].a_matrices)
+            shape = (shape, 4)
+            initialize_buffer(buffer, base_storage_size=4 * shape[0])
+            inp = get_buffer_temporary(buffer, shape=shape)
+            silenced_warning('read_no_write({})'.format(inp))
+
+            # Call the individual setup_methods
+            for i, sumf in enumerate(sumfact_kernels):
+                func, args = sumf.setup_method
+                insn_dep = insn_dep.union({func(inp, *args, additional_indices=(i,))})
+        else:
+            # No setup method defined. We need to make sure the input is correctly setup
+            shape = tuple(mat.cols for mat in sumfact_kernels[0].a_matrices) + (4,)
+            initialize_buffer(buffer, base_storage_size=4 * shape[0])
+            dim_tags = ",".join("f" * (len(shape) - 1)) + ",c"
+            inp = get_buffer_temporary(buffer, shape=shape, dim_tags=dim_tags)
+
+            for i, sumf in enumerate(sumfact_kernels):
+                assert sumf.input_temporary
+                for insn in insns:
+                    if isinstance(insn, lp.Assignment):
+                        if get_pymbolic_basename(insn.assignee) == sumf.input_temporary:
+                            built_instruction(insn.copy(assignee=prim.Subscript(prim.Variable(inp), insn.assignee.index + (i,))))
+                            from dune.perftool.generation.loopy import expr_instruction_impl
+                            expr_instruction_impl._memoize_cache = {k: v for k, v in expr_instruction_impl._memoize_cache.items() if v.id != insn.id}
+
+        # Determine the joined AMatrix
+        large_a_matrices = []
+        for i in range(len(sumfact_kernels[0].a_matrices)):
+            assert len(set(tuple(sf.a_matrices[i].rows for sf in sumfact_kernels))) == 1
+            assert len(set(tuple(sf.a_matrices[i].cols for sf in sumfact_kernels))) == 1
+            large = LargeAMatrix(rows=sumfact_kernels[0].a_matrices[i].rows,
+                                 cols=sumfact_kernels[0].a_matrices[i].cols,
+                                 transpose=tuple(sf.a_matrices[i].transpose for sf in sumfact_kernels),
+                                 derivative=tuple(sf.a_matrices[i].derivative for sf in sumfact_kernels),
+                                 )
+            large_a_matrices.append(large)
+
+        # Join the instruction dependencies
+        insn_dep = insn_dep.union(*tuple(sf.insn_dep for sf in sumfact_kernels))
+
+        var, dep = sum_factorization_kernel(large_a_matrices, buffer, insn_dep, sumfact_kernels[0].additional_inames, add_vec_tag=True)
+
+        for insn in insns:
+            if isinstance(insn, (lp.Assignment, lp.CallInstruction)):
+                sumfacts = find_sumfact(insn.expression)
+                if sumfacts:
+                    sumf, = sumfacts
+                    if sumf.stage == stage:
+                        replace = {}
+                        replace[sumf] = prim.Variable(var)
+                        newexpr = substitute(insn.expression, replace)
+                        newexpr = fiddle_in_index(newexpr, var, sumf.preferred_interleaving_position)
+                        assert newexpr
+                        built_instruction(insn.copy(expression=newexpr,
+                                                    depends_on=dep,
+                                                    ))
+
+def expand_sumfact_kernels(insns):
+    if get_option("vectorize_grads"):
+        apply_sumfact_grad_vectorization(insns, 1)
+        apply_sumfact_grad_vectorization(insns, 3)
+    else:
+        default_resolution(insns)
+
+
 def filter_sumfact_instructions():
    """ Remove all instructions that contain a SumfactKernel node """
    from dune.perftool.generation.loopy import expr_instruction_impl, call_instruction_impl
@@ -109,19 +253,13 @@ def sumfact_iname(bound, _type):
    return name


-def setup_theta(element, restriction, component, a_matrices, buffer_name):
-    number_basis = product(mat.cols for mat in a_matrices)
-    shape = (number_basis,)
-    inp = get_buffer_temporary(buffer_name,
-                               shape=shape)
-    silenced_warning('read_no_write({})'.format(inp))
-
+def setup_theta(inp, element, restriction, component, additional_indices=()):
    # Write initial coefficients into buffer
-    basisiname = sumfact_iname(number_basis, "basis")
    lfs = name_lfs(element, restriction, component)
+    basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
    container = name_coefficientcontainer(restriction)
    coeff = pymbolic_coefficient(container, lfs, basisiname)
-    assignee = Subscript(Variable(inp), (Variable(basisiname),))
+    assignee = Subscript(Variable(inp), (Variable(basisiname),) + additional_indices)
    return instruction(assignee=assignee,
                       expression=coeff,
                       )
@@ -178,9 +316,10 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
            a_matrices = [theta_matrix] * dim
            a_matrices[i] = dtheta_matrix
            a_matrices = tuple(a_matrices)
-
+            pref_pos = i
        else:
            a_matrices = (theta_matrix,) * dim
+            pref_pos = dim

        # Initialize a base storage for this buffer and get a temporay pointing to it
        temp = initialize_buffer(buf,
@@ -219,6 +358,9 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                               buf,
                               insn_dep=frozenset({contrib_dep}),
                               additional_inames=frozenset(visitor.inames),
+                               stage=3,
+                               preferred_interleaving_position=pref_pos,
+                               input_temporary=temp,
                               )

        inames = tuple(sumfact_iname(mat.rows, 'accum') for mat in a_matrices)
@@ -258,7 +400,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
        transform(nest_quadrature_loops, visitor.inames)


-def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional_inames=frozenset({})):
+def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional_inames=frozenset({}), add_vec_tag=False):
    """
    Calculate a sum factorization matrix product.

@@ -278,6 +420,10 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
        should depend upon. All following ones will depend on each
        other.
    """
+    ftags = "f,f" + (",vec" if add_vec_tag else "")
+    ctags = "c,c" + (",vec" if add_vec_tag else "")
+    vec_shape = (4,) if add_vec_tag else ()
+
    insn_dep = frozenset({barrier(depends_on=insn_dep,
                                  within_inames=additional_inames,
                                  )})
@@ -288,8 +434,8 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
        # this reinterprets the output of the previous iteration.
        inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
        inp = get_buffer_temporary(buf,
-                                   shape=inp_shape,
-                                   dim_tags="f,f")
+                                   shape=inp_shape + vec_shape,
+                                   dim_tags=ftags)

        # The input temporary will only be read from, so we need to silence the loopy warning
        silenced_warning('read_no_write({})'.format(inp))
@@ -301,22 +447,34 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
        out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))

        out = get_buffer_temporary(buf,
-                                   shape=out_shape,
-                                   dim_tags="c,c")
+                                   shape=out_shape + vec_shape,
+                                   dim_tags=ctags)

        # Get the inames needed for one matrix-matrix multiplication
        i = sumfact_iname(out_shape[0], "row")
        j = sumfact_iname(out_shape[1], "col")
        k = sumfact_iname(a_matrix.cols, "red")

+        # Select the correct Theta Matrix for multiplication
+        if isinstance(a_matrix, AMatrix):
+            theta = name_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative)
+            vec_iname = ()
+        elif isinstance(a_matrix, LargeAMatrix):
+            theta = name_large_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative)
+            iname = sumfact_iname(4, "vec")
+            vec_iname = (prim.Variable(iname),)
+            transform(lp.tag_inames, [(iname, "vec")])
+        else:
+            raise TypeError("What kind of an AMatrix is that?")
+
        # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-        prod = Product((Subscript(Variable(name_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative)), (Variable(i), Variable(k))),
-                        Subscript(Variable(inp), (Variable(k), Variable(j)))
+        prod = Product((Subscript(Variable(theta), (Variable(i), Variable(k)) + vec_iname),
+                        Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
                        ))

        # Issue the reduction instruction that implements the multiplication
        # at the same time store the instruction ID for the next instruction to depend on
-        insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j))),
+        insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname),
                                          expression=Reduction("sum", k, prod),
                                          forced_iname_deps=frozenset({i, j}).union(additional_inames),
                                          forced_iname_deps_is_final=True,
@@ -330,6 +488,11 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional

    out_shape = tuple(mat.rows for mat in a_matrices)
    dim_tags = ",".join(['f'] * dim)
+
+    if add_vec_tag:
+        out_shape = out_shape + vec_shape
+        dim_tags = dim_tags + ",c"
+
    out = get_buffer_temporary(buf,
                               shape=out_shape,
                               dim_tags=dim_tags)

--- a/pytools @ 7ed09d95
+++ b/pytools @ 7ed09d95
-Subproject commit 6c5830f6e007f4ece99d75c83b0f72550cad80c2
+Subproject commit 7ed09d95be2bd2fb22392010cf28830fd43a30ca
--- a/test/sumfact/mass/mass.mini
+++ b/test/sumfact/mass/mass.mini
@@ -13,5 +13,5 @@ extension = vtu

 [formcompiler]
 numerical_jacobian = 1, 0 | expand num
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
 sumfact = 1
--- a/test/sumfact/mass/mass_3d.mini
+++ b/test/sumfact/mass/mass_3d.mini
@@ -15,5 +15,5 @@ extension = vtu

 [formcompiler]
 numerical_jacobian = 1, 0 | expand num
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
 sumfact = 1
--- a/test/sumfact/poisson/poisson_2d_order1.mini
+++ b/test/sumfact/poisson/poisson_2d_order1.mini
@@ -17,4 +17,4 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-4
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
--- a/test/sumfact/poisson/poisson_2d_order2.mini
+++ b/test/sumfact/poisson/poisson_2d_order2.mini
@@ -17,4 +17,4 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-8
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec
--- a/test/sumfact/poisson/poisson_3d_order1.mini
+++ b/test/sumfact/poisson/poisson_3d_order1.mini
 __name = sumfact_poisson_3d_order1_{__exec_suffix}
-__exec_suffix = {diff_suffix}_{vec_suffix}
+__exec_suffix = {diff_suffix}_{vecq_suffix}_{vecg_suffix}

 diff_suffix = numdiff, symdiff | expand num
-vec_suffix = vec, nonvec | expand vec
+vecq_suffix = quadvec, nonquadvec | expand vec_q
+vecg_suffix = gradvec, nongradvec | expand vec_g
+{vecq_suffix} == quadvec and {vecg_suffix} == gradvec | exclude

 cells = 8 8 8
 extension = 1. 1. 1.
@@ -17,4 +19,5 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-4
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec_q
+vectorize_grads = 1, 0 | expand vec_g
--- a/test/sumfact/poisson/poisson_3d_order2.mini
+++ b/test/sumfact/poisson/poisson_3d_order2.mini
 __name = sumfact_poisson_3d_order2_{__exec_suffix}
-__exec_suffix = {diff_suffix}_{vec_suffix}
+__exec_suffix = {diff_suffix}_{vecq_suffix}_{vecg_suffix}

 diff_suffix = numdiff, symdiff | expand num
-vec_suffix = vec, nonvec | expand vec
+vecq_suffix = quadvec, nonquadvec | expand vec_q
+vecg_suffix = gradvec, nongradvec | expand vec_g
+{vecq_suffix} == quadvec and {vecg_suffix} == gradvec | exclude

 cells = 8 8 8
 extension = 1. 1. 1.
@@ -17,4 +19,5 @@ numerical_jacobian = 1, 0 | expand num
 exact_solution_expression = g
 compare_l2errorsquared = 1e-8
 sumfact = 1
-vectorize = 1, 0 | expand vec
+vectorize_quad = 1, 0 | expand vec_q
+vectorize_grads = 1, 0 | expand vec_g