diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index 9d71f86c0913fd766112ec9ccbc8c91e8d9b55f9..d8ab37fbb57f176e4776eb0436de1ef88363254d 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -31,7 +31,6 @@ from dune.perftool.generation.cpp import (base_class,
                                           )
 
 from dune.perftool.generation.loopy import (barrier,
-                                            built_instruction,
                                             constantarg,
                                             construct_subst_rule,
                                             domain,
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index f01bdf03f85c412f860f33ed1df202f81c5dda52..d098c798fe13f1bdd34fd4502fc3b8a0b4f6fa1c 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -15,7 +15,6 @@ iname = generator_factory(item_tags=("iname",), context_tags="kernel")
 function_mangler = generator_factory(item_tags=("mangler",), context_tags="kernel")
 silenced_warning = generator_factory(item_tags=("silenced_warning",), no_deco=True, context_tags="kernel")
 kernel_cached = generator_factory(item_tags=("default_cached",), context_tags="kernel")
-built_instruction = generator_factory(item_tags=("instruction",), context_tags="kernel", no_deco=True)
 
 
 class DuneGlobalArg(lp.GlobalArg):
diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py
index feef4570c1484c7b5cfc267ca9dc52081bda4772..edb1d378f2c5ea680f18dc7c9103486f74ef2d1f 100644
--- a/python/dune/perftool/loopy/buffer.py
+++ b/python/dune/perftool/loopy/buffer.py
@@ -1,25 +1,22 @@
 from dune.perftool.error import PerftoolLoopyError
-from dune.perftool.generation import (generator_factory,
-                                      get_counted_variable,
-                                      get_global_context_value,
+from dune.perftool.generation import (get_counted_variable,
+                                      kernel_cached,
                                       temporary_variable,
                                       )
 
 
 class FlipFlopBuffer(object):
-    def __init__(self, identifier, base_storage_size=None, num=2):
+    def __init__(self, identifier):
         self.identifier = identifier
-        self.base_storage_size = base_storage_size
-        self.num = num
 
         # Initialize the counter that switches between the base storages!
         self._current = 0
 
         # Generate the base storage names
-        self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in range(self.num))
+        self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in (0, 1))
 
     def switch_base_storage(self):
-        self._current = (self._current + 1) % self.num
+        self._current = (self._current + 1) % 2
 
     def get_temporary(self, **kwargs):
         assert("base_storage" not in kwargs)
@@ -33,10 +30,6 @@ class FlipFlopBuffer(object):
         if name is None:
             name = get_counted_variable(self.identifier)
 
-        # Get geometric dimension
-        formdata = get_global_context_value('formdata')
-        dim = formdata.geometric_dimension
-
         # Construct the temporary and return it
         temporary_variable(name,
                            base_storage=base,
@@ -47,14 +40,10 @@ class FlipFlopBuffer(object):
         return name
 
 
-@generator_factory(item_tags=("buffer"), cache_key_generator=lambda i, **kw: i, context_tags=("kernel",))
-def initialize_buffer(identifier, base_storage_size=None, num=2):
-    if base_storage_size is None:
-        raise PerftoolLoopyError("The buffer for identifier {} has not been initialized.".format(identifier))
-    return FlipFlopBuffer(identifier,
-                          base_storage_size=base_storage_size,
-                          num=num,
-                          )
+@kernel_cached
+def initialize_buffer(identifier):
+    assert isinstance(identifier, str)
+    return FlipFlopBuffer(identifier)
 
 
 def get_buffer_temporary(identifier, **kwargs):
diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py
index 7efae2d54331886f081fbf16102c6ddf77892763..a0a7de0172f6310067ddd7a72160a595889fdcaa 100644
--- a/python/dune/perftool/loopy/symbolic.py
+++ b/python/dune/perftool/loopy/symbolic.py
@@ -3,6 +3,7 @@
 Use this module to insert pymbolic nodes and the likes.
 """
 from dune.perftool.error import PerftoolError
+from dune.perftool.sumfact.symbolic import SumfactKernel
 from pymbolic.mapper.substitutor import make_subst_func
 
 import loopy as lp
@@ -15,36 +16,6 @@ from six.moves import intern
 #
 
 
-class SumfactKernel(prim.Variable):
-    def __init__(self,
-                 a_matrices,
-                 buffer,
-                 stage,
-                 preferred_position,
-                 restriction,
-                 ):
-        if not isinstance(restriction, tuple):
-            restriction = (restriction, 0)
-
-        self.a_matrices = a_matrices
-        self.buffer = buffer
-        self.stage = stage
-        self.preferred_position = preferred_position
-        self.restriction = restriction
-
-        prim.Variable.__init__(self, "SUMFACT")
-
-    def __getinitargs__(self):
-        return (self.a_matrices, self.buffer, self.stage, self.preferred_position, self.restriction)
-
-    def stringifier(self):
-        return lp.symbolic.StringifyMapper
-
-    init_arg_names = ("a_matrices", "buffer", "stage", "preferred_position", "restriction")
-
-    mapper_method = "map_sumfact_kernel"
-
-
 class FusedMultiplyAdd(prim.Expression):
     """ Represents an FMA operation """
 
diff --git a/python/dune/perftool/loopy/transformations/collect_rotate.py b/python/dune/perftool/loopy/transformations/collect_rotate.py
index c0e222c2819015f272199c5bbb4440ba11af770c..6cc8b102bbbba30a2e59b6c6b901186cf21e4a0d 100644
--- a/python/dune/perftool/loopy/transformations/collect_rotate.py
+++ b/python/dune/perftool/loopy/transformations/collect_rotate.py
@@ -31,7 +31,7 @@ def rotate_function_mangler(knl, func, arg_dtypes):
         # passing the vector registers as references and have them
         # changed. Loopy assumes this function to be read-only.
         include_file("dune/perftool/sumfact/transposereg.hh", filetag="operatorfile")
-        vcl = lp.types.NumpyType(get_vcl_type(np.float64, register_size=256))
+        vcl = lp.types.NumpyType(get_vcl_type(np.float64))
         return lp.CallMangleInfo("transpose_reg", (), (vcl, vcl, vcl, vcl))
 
 
diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py
index 271e547ddb4f7a49a27e0d68de6157c5c9d536f7..d175f02585d886c4950cd25493f895b3a4ed6447 100644
--- a/python/dune/perftool/loopy/vcl.py
+++ b/python/dune/perftool/loopy/vcl.py
@@ -1,15 +1,10 @@
 """
 Our extensions to the loopy type system
 """
-from dune.perftool.generation import (function_mangler,
-                                      include_file,
-                                      )
-
-from loopy.symbolic import FunctionIdentifier
-from loopy.types import NumpyType
-from loopy import CallMangleInfo
-
+from dune.perftool.options import get_option
+from dune.perftool.generation import function_mangler
 
+import loopy as lp
 import numpy as np
 
 
@@ -47,16 +42,15 @@ def _populate_vcl_type_registry():
 _populate_vcl_type_registry()
 
 
-def get_vcl_type_size(nptype, register_size=256):
-    return register_size // (np.dtype(nptype).itemsize * 8)
+def get_vcl_type_size(nptype, register_size=None):
+    if register_size is None:
+        register_size = get_option("max_vector_width")
 
+    return register_size // (np.dtype(nptype).itemsize * 8)
 
-def get_vcl_type(nptype, register_size=None, vec_size=None):
-    assert register_size or vec_size
-    assert not(register_size and vec_size)
 
-    if not vec_size:
-        vec_size = register_size // (np.dtype(nptype).itemsize * 8)
+def get_vcl_type(nptype, register_size=None):
+    vec_size = get_vcl_type_size(nptype, register_size)
 
     return VCLTypeRegistry.types[np.dtype(nptype), vec_size]
 
@@ -64,9 +58,13 @@ def get_vcl_type(nptype, register_size=None, vec_size=None):
 @function_mangler
 def vcl_function_mangler(knl, func, arg_dtypes):
     if func == "mul_add":
-        vcl = lp.types.NumpyType(get_vcl_type(np.float64, register_size=256))
+        vcl = lp.types.NumpyType(get_vcl_type(np.float64))
         return lp.CallMangleInfo("mul_add", (vcl,), (vcl, vcl, vcl))
 
     if func == "select":
-        vcl = lp.types.NumpyType(get_vcl_type(np.float64, register_size=256))
+        vcl = lp.types.NumpyType(get_vcl_type(np.float64))
         return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))
+
+    if func == "horizontal_add":
+        vcl = lp.types.NumpyType(get_vcl_type(np.float64))
+        return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(np.float64),), (vcl,))
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index b751e420f91e5a7f97da583f41f5c3b48be34fdd..3589f773f15c5002d452accbd242ebc3d6c5a96b 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -2,7 +2,7 @@
 
 from argparse import ArgumentParser
 from os.path import abspath
-from pytools import ImmutableRecord
+from pytools import ImmutableRecord, memoize
 
 from dune.common.parametertree.parser import parse_ini_file
 
@@ -16,7 +16,7 @@ class PerftoolOption(ImmutableRecord):
                  _type=type(None),
                  ):
         _type = type(default)
-        if _type is type(None):
+        if issubclass(_type, type(None)):
             _type = str
         ImmutableRecord.__init__(self,
                                  helpstr=helpstr,
@@ -32,6 +32,7 @@ class PerftoolOptionsArray(ImmutableRecord):
         opts.update(**kwargs)
         ImmutableRecord.__init__(self, **opts)
 
+    # Arguments that are to be set from the outside
     uflfile = PerftoolOption(helpstr="the UFL file to compile")
     driver_file = PerftoolOption(helpstr="The filename for the generated driver header")
     operator_file = PerftoolOption(helpstr="The filename for the generated local operator header")
@@ -57,6 +58,10 @@ class PerftoolOptionsArray(ImmutableRecord):
     vectorize_quad = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
     turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
+    architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
+
+    # Arguments that are mainly to be set by logic depending on other options
+    max_vector_width = PerftoolOption(default=256, helpstr=None)
 
 
 # Until more sophisticated logic is needed, we keep the actual option data in this module
@@ -64,7 +69,7 @@ _options = PerftoolOptionsArray()
 
 
 def initialize_options():
-    """ This should be called in entrypoint methods to correctly set up an options array """
+    """ Initialize the options from the command line """
     global _options
     _options = update_options_from_commandline(_options)
     _options = update_options_from_inifile(_options)
@@ -78,7 +83,7 @@ def update_options_from_commandline(opt):
                             )
     parser.add_argument('--version', action='version', version='%(prog)s 0.1')
     for k, v in type(opt).__dict__.items():
-        if isinstance(v, PerftoolOption):
+        if isinstance(v, PerftoolOption) and v.helpstr is not None:
             cmdopt = "--{}".format(k.replace('_', '-'))
             parser.add_argument(cmdopt, help=v.helpstr, type=v.type)
     parsedargs = {k: v for k, v in vars(parser.parse_args()).items() if v is not None}
@@ -98,21 +103,35 @@ def update_options_from_inifile(opt):
     return opt
 
 
+@memoize
+def process_options(opt):
+    """ Make sure that the options have been fully processed """
+    opt = expand_architecture_options(opt)
+    return opt
+
+
+def expand_architecture_options(opt):
+    if opt.architecture == "haswell":
+        return opt.copy(max_vector_width=256)
+    elif opt.architecture == "knl":
+        return opt.copy(max_vector_width=512)
+    else:
+        raise NotImplementedError("Architecture {} not known!".format(opt.architecture))
+
+
 def set_option(key, value):
     """Add the key value pair to the options.
 
     If the key is already in the options dictionary its value will be
     overwritten.  Form compiler arguments will always be set before
     any other options.
-
     """
     global _options
-    _options = _options.copy(**{key:value})
-    assert getattr(_options, key) == value
+    _options = process_options(_options).copy(**{key: value})
 
 
 def get_option(key):
-    return getattr(_options, key)
+    return getattr(process_options(_options), key)
 
 
 def option_switch(opt):
diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py
index 0389416b5999e96b7e303434f07cab7e07a824b5..f7f108b311b2f17379e5e719730f75e46ff5c9f2 100644
--- a/python/dune/perftool/sumfact/__init__.py
+++ b/python/dune/perftool/sumfact/__init__.py
@@ -12,6 +12,8 @@ from dune.perftool.sumfact.basis import (lfs_inames,
                                          pymbolic_coefficient,
                                          pymbolic_coefficient_gradient,
                                          )
+
+import dune.perftool.sumfact.accumulation
 import dune.perftool.sumfact.switch
 
 from dune.perftool.pdelab import PDELabInterface
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
new file mode 100644
index 0000000000000000000000000000000000000000..56ad22cba11a0a501c1c8a285240311d4ddc18c3
--- /dev/null
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -0,0 +1,245 @@
+from dune.perftool.pdelab.argument import (name_accumulation_variable,
+                                           PDELabAccumulationFunction,
+                                           )
+from dune.perftool.generation import (backend,
+                                      domain,
+                                      dump_accumulate_timer,
+                                      function_mangler,
+                                      get_counted_variable,
+                                      get_counter,
+                                      iname,
+                                      instruction,
+                                      post_include,
+                                      kernel_cached,
+                                      temporary_variable,
+                                      transform,
+                                      )
+from dune.perftool.options import get_option
+from dune.perftool.loopy.flatten import flatten_index
+from dune.perftool.loopy.buffer import get_buffer_temporary
+from dune.perftool.sumfact.quadrature import nest_quadrature_loops
+from dune.perftool.pdelab.localoperator import determine_accumulation_space
+from dune.perftool.pdelab.restriction import restricted_name
+from dune.perftool.pdelab.signatures import assembler_routine_name
+from dune.perftool.pdelab.geometry import world_dimension
+from dune.perftool.sumfact.tabulation import (basis_functions_per_direction,
+                                              construct_basis_matrix_sequence,
+                                              )
+from dune.perftool.sumfact.switch import (get_facedir,
+                                          get_facemod,
+                                          )
+from dune.perftool.sumfact.symbolic import SumfactKernel
+from dune.perftool.ufl.modified_terminals import extract_modified_arguments
+from dune.perftool.tools import get_pymbolic_basename
+from dune.perftool.error import PerftoolError
+from dune.perftool.sumfact.quadrature import quadrature_inames
+
+
+import loopy as lp
+import numpy as np
+import pymbolic.primitives as prim
+
+
+@iname
+def _sumfact_iname(bound, _type, count):
+    name = "sf_{}_{}".format(_type, str(count))
+    domain(name, bound)
+    return name
+
+
+def sumfact_iname(bound, _type):
+    count = get_counter('_sumfac_iname_{}'.format(_type))
+    name = _sumfact_iname(bound, _type, count)
+    return name
+
+
+@kernel_cached
+def accum_iname(restriction, bound, i):
+    return sumfact_iname(bound, "accum")
+
+
+def name_test_function_contribution(test):
+    count = get_counter('sumfact_contribution_counter')
+    grad = "grad_" if test.reference_grad else ""
+    return restricted_name("contrib_{}phi_{}".format(grad, str(count)), test.restriction)
+
+
+@backend(interface="accum_insn", name="sumfact")
+def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
+    pymbolic_expr = visitor(accterm.term)
+
+    if pymbolic_expr == 0:
+        return
+
+    dim = world_dimension()
+
+    # If this is a gradient, we find the gradient iname
+    additional_inames = frozenset({})
+    if accterm.new_indices is not None:
+        for i in accterm.new_indices:
+            if i not in visitor.dimension_indices:
+                from dune.perftool.pdelab.localoperator import grad_iname
+                additional_inames = additional_inames.union(frozenset({grad_iname(i, dim)}))
+
+    # Figure out the name of the accumulation variable
+    test_lfs = determine_accumulation_space(accterm.argument.expr, 0, measure)
+    ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure)
+    accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
+
+    def emit_sumfact_kernel(i, restriction, insn_dep):
+        # Construct the matrix sequence for this sum factorization
+        matrix_sequence = construct_basis_matrix_sequence(transpose=True,
+                                                          derivative=i if accterm.new_indices else None,
+                                                          facedir=get_facedir(accterm.argument.restriction),
+                                                          facemod=get_facemod(accterm.argument.restriction),
+                                                          )
+
+        sf = SumfactKernel(matrix_sequence=matrix_sequence,
+                           restriction=(accterm.argument.restriction, restriction),
+                           stage=3,
+                           preferred_position=i if accterm.new_indices else None,
+                           within_inames=visitor.inames,
+                           accumvar=accum,
+                           )
+
+        from dune.perftool.sumfact.vectorization import attach_vectorization_info
+        sf = attach_vectorization_info(sf)
+
+        # Make sure we have a buffer that we can set up the input with
+        buffer = sf.buffer
+        if buffer is None:
+            buffer = get_counted_variable("buffer")
+
+        vectag = frozenset({"gradvec"}) if sf.index is not None else frozenset()
+
+        temp = get_buffer_temporary(buffer,
+                                    shape=sf.quadrature_shape,
+                                    dim_tags=sf.quadrature_dimtags,
+                                    name=sf.input,
+                                    )
+
+        # Those input fields, that are padded need to be set to zero
+        # in order to do a horizontal_add later on
+        for pad in sf.padding:
+            assignee = prim.Subscript(prim.Variable(temp),
+                                      tuple(prim.Variable(i) for i in quadrature_inames()) + (pad,))
+            instruction(assignee=assignee,
+                        expression=0,
+                        forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
+                        forced_iname_deps_is_final=True,
+                        tags=frozenset(["quadvec", "gradvec"])
+                        )
+
+        # Replace gradient iname with correct index for assignement
+        replace_dict = {}
+        for iname in additional_inames:
+            replace_dict[prim.Variable(iname)] = i
+        from dune.perftool.loopy.symbolic import substitute
+        expression = substitute(pymbolic_expr, replace_dict)
+
+        # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
+        timer_dep = frozenset()
+        if get_option("instrumentation_level") >= 4:
+            timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+            dump_accumulate_timer(timer_name)
+            if(visitor.inames):
+                timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                                   within_inames=frozenset(visitor.inames))})
+
+        # Determine dependencies
+        from loopy.match import Or, Writes
+        from loopy.symbolic import DependencyMapper
+        from dune.perftool.tools import get_pymbolic_basename
+        deps = Or(tuple(Writes(get_pymbolic_basename(expr)) for expr in DependencyMapper()(expression)))
+
+        # Issue an instruction in the quadrature loop that fills the buffer
+        # with the evaluation of the contribution at all quadrature points
+        assignee = prim.Subscript(prim.Variable(temp),
+                                  tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index)
+        contrib_dep = instruction(assignee=assignee,
+                                  expression=expression,
+                                  forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
+                                  forced_iname_deps_is_final=True,
+                                  tags=frozenset({"quadvec"}).union(vectag),
+                                  depends_on=frozenset({deps}).union(timer_dep)
+                                  )
+
+        if insn_dep is None:
+            insn_dep = frozenset({contrib_dep})
+
+        if get_option("instrumentation_level") >= 4:
+            insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                              depends_on=insn_dep,
+                                              within_inames=frozenset(visitor.inames))})
+
+        inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i)
+                       for i, mat in enumerate(sf.matrix_sequence))
+
+        # Collect the lfs and lfs indices for the accumulate call
+        test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames),
+                                       (basis_functions_per_direction(),) * dim,
+                                       order="f"
+                                       )
+
+        # In the jacobian case, also determine the space for the ansatz space
+        rank = 2 if visitor.inames else 1
+        if rank == 2:
+            # TODO the next line should get its inames from
+            # elsewhere. This is *NOT* robust (but works right now)
+            ansatz_lfs.index = flatten_index(tuple(prim.Variable(visitor.inames[i])
+                                                   for i in range(world_dimension())),
+                                             (basis_functions_per_direction(),) * dim,
+                                             order="f"
+                                             )
+
+        # Add a sum factorization kernel that implements the multiplication
+        # with the test function (stage 3)
+        from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
+        result, insn_dep = realize_sum_factorization_kernel(sf.copy(insn_dep=insn_dep))
+
+        # Determine the expression to accumulate with. This depends on the vectorization strategy!
+        result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
+        vecinames = ()
+        if sf.index is not None:
+            iname = accum_iname((accterm.argument.restriction, restriction), 4, "vec")
+            vecinames = (iname,)
+            transform(lp.tag_inames, [(iname, "vec")])
+            from dune.perftool.tools import maybe_wrap_subscript
+            result = prim.Call(prim.Variable("horizontal_add"),
+                               (maybe_wrap_subscript(result, prim.Variable(iname)),),
+                               )
+
+        if not get_option("fastdg"):
+            expr = prim.Call(PDELabAccumulationFunction(accum, rank),
+                             (test_lfs.get_args() +
+                              ansatz_lfs.get_args() +
+                              (result,)
+                              )
+                             )
+            instruction(assignees=(),
+                        expression=expr,
+                        forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
+                        forced_iname_deps_is_final=True,
+                        depends_on=insn_dep,
+                        )
+
+        # Mark the transformation that moves the quadrature loop
+        # inside the trialfunction loops for application
+        transform(nest_quadrature_loops, visitor.inames)
+
+        return insn_dep
+
+    # Extract the restrictions on argument-1:
+    jac_restrictions = frozenset(tuple(ma.restriction for ma in
+                                       extract_modified_arguments(accterm.term, argnumber=1, do_index=True)))
+    if not jac_restrictions:
+        jac_restrictions = frozenset({0})
+
+    insn_dep = None
+    for restriction in jac_restrictions:
+        if accterm.new_indices:
+            for i in range(world_dimension()):
+                insn_dep = emit_sumfact_kernel(i, restriction, insn_dep)
+        else:
+            insn_dep = emit_sumfact_kernel(None, restriction, insn_dep)
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index e9b5a6ceb39b06b6757b96c76d7b1d4e7ec49ff8..b4861eebf81923c6253475cfdb7c55cd553d08d4 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -6,6 +6,8 @@ multiplication with the test function is part of the sum factorization kernel.
 
 from dune.perftool.generation import (backend,
                                       domain,
+                                      get_backend,
+                                      get_counted_variable,
                                       get_counter,
                                       get_global_context_value,
                                       iname,
@@ -13,18 +15,13 @@ from dune.perftool.generation import (backend,
                                       kernel_cached,
                                       temporary_variable,
                                       )
-from dune.perftool.sumfact.amatrix import (AMatrix,
-                                           basis_functions_per_direction,
-                                           construct_amatrix_sequence,
-                                           name_theta,
-                                           quadrature_points_per_direction,
-                                           )
-from dune.perftool.sumfact.sumfact import (get_facedir,
-                                           setup_theta,
-                                           SumfactKernel,
-                                           sumfact_iname,
-                                           sum_factorization_kernel,
-                                           )
+from dune.perftool.sumfact.tabulation import (basis_functions_per_direction,
+                                              construct_basis_matrix_sequence,
+                                              name_theta,
+                                              quadrature_points_per_direction,
+                                              PolynomialLookup,
+                                              name_polynomials,
+                                              )
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.switch import (get_facedir,
                                           get_facemod,
@@ -34,6 +31,7 @@ from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            )
 from dune.perftool.loopy.buffer import initialize_buffer
+from dune.perftool.sumfact.symbolic import SumfactKernel
 from dune.perftool.options import get_option
 from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.restriction import restricted_name
@@ -65,67 +63,53 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
     shape_impl = ('arr',) * rank
     temporary_variable(name, shape=shape, shape_impl=shape_impl)
 
-    dim = world_dimension()
+    # Whether direct indexing into the output is possible. This happens
+    # if the positioning within a SIMD vectors coincides with the index!
+    direct_indexing_is_possible = True
+
     buffers = []
-    for i in range(dim):
+    for i in range(world_dimension()):
         # Construct the matrix sequence for this sum factorization
-        a_matrices = construct_amatrix_sequence(derivative=i,
-                                                facedir=get_facedir(restriction),
-                                                facemod=get_facemod(restriction),
-                                                )
-
-        # Get the vectorization info. If this happens during the dry run, we get dummies
-        from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction)
-
-        # Initialize the buffer for the sum fact kernel
-        shape = (product(mat.cols for mat in a_matrices),)
-        if index is not None:
-            shape = shape + (4,)
-        inp = initialize_buffer(buf,
-                                base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
-                                num=2
-                                ).get_temporary(shape=shape,
-                                                name=inp,
-                                                )
-        insn_dep = frozenset({Writes(inp)})
-
-        if get_option('fastdg'):
-            # Name of direct input, shape and globalarg is set in sum_factorization_kernel
-            direct_input = coeff_func(restriction)
-        else:
-            direct_input = None
-            # Setup the input!
-            setup_theta(inp, element, restriction, component, index, coeff_func)
+        matrix_sequence = construct_basis_matrix_sequence(derivative=i,
+                                                          facedir=get_facedir(restriction),
+                                                          facemod=get_facemod(restriction),
+                                                          )
+
+        # The sum factorization kernel object gathering all relevant information
+        sf = SumfactKernel(matrix_sequence=matrix_sequence,
+                           restriction=restriction,
+                           preferred_position=i,
+                           coeff_func=coeff_func,
+                           element=element,
+                           component=component,
+                           )
+
+        from dune.perftool.sumfact.vectorization import attach_vectorization_info
+        sf = attach_vectorization_info(sf)
+
+        if i != sf.index:
+            direct_indexing_is_possible = False
 
         # Add a sum factorization kernel that implements the
         # evaluation of the gradients of basis functions at quadrature
         # points (stage 1)
-        var, insn_dep = sum_factorization_kernel(a_matrices,
-                                                 buf,
-                                                 1,
-                                                 preferred_position=i,
-                                                 insn_dep=insn_dep,
-                                                 restriction=restriction,
-                                                 outshape=tuple(mat.rows for mat in a_matrices if mat.face is None),
-                                                 direct_input=direct_input,
-                                                 )
+        from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
+        var, insn_dep = realize_sum_factorization_kernel(sf)
+
         buffers.append(var)
 
     # Check whether we want to return early with something that has the indexing
     # already handled! This happens with vectorization when the index coincides
     # with the position in the vector register.
-    if index:
+    if direct_indexing_is_possible:
         assert len(visitor.indices) == 1
         return maybe_wrap_subscript(var, tuple(prim.Variable(i) for i in quadrature_inames()) + visitor.indices), None
 
     # TODO this should be quite conditional!!!
     for i, buf in enumerate(buffers):
         # Write solution from sumfactorization to gradient variable
-        from pymbolic.primitives import Subscript, Variable
-        from dune.perftool.generation import get_backend
-        assignee = Subscript(Variable(name), i)
-        expression = Subscript(buf, tuple(Variable(i) for i in quadrature_inames()))
+        assignee = prim.Subscript(prim.Variable(name), i)
+        expression = prim.Subscript(buf, tuple(prim.Variable(i) for i in quadrature_inames()))
         instruction(assignee=assignee,
                     expression=expression,
                     forced_iname_deps=frozenset(get_backend("quad_inames")()),
@@ -137,55 +121,28 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
 
 @kernel_cached
 def pymbolic_coefficient(element, restriction, component, coeff_func, visitor):
-    # Get geometric dimension
-    dim = world_dimension()
-
     # Construct the matrix sequence for this sum factorization
-    a_matrices = construct_amatrix_sequence(facedir=get_facedir(restriction),
-                                            facemod=get_facemod(restriction),)
-
-    # Get the vectorization info. If this happens during the dry run, we get dummies
-    from dune.perftool.sumfact.vectorization import get_vectorization_info
-    a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction)
-
-    # Flip flop buffers for sumfactorization
-    shape = (product(mat.cols for mat in a_matrices),)
-    if index is not None:
-        shape = shape + (4,)
-    initialize_buffer(buf,
-                      base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
-                      num=2
-                      ).get_temporary(shape=shape,
-                                      name=inp,
-                                      )
+    matrix_sequence = construct_basis_matrix_sequence(facedir=get_facedir(restriction),
+                                                      facemod=get_facemod(restriction),)
+
+    sf = SumfactKernel(matrix_sequence=matrix_sequence,
+                       restriction=restriction,
+                       coeff_func=coeff_func,
+                       element=element,
+                       component=component,
+                       )
 
-    if get_option('fastdg'):
-        # Name of direct input, shape and globalarg is set in sum_factorization_kernel
-        direct_input = coeff_func(restriction)
-    else:
-        direct_input = None
-        # Setup the input!
-        setup_theta(inp, element, restriction, component, index, coeff_func)
+    # TODO: Move this away!
+    from dune.perftool.sumfact.vectorization import attach_vectorization_info
+    sf = attach_vectorization_info(sf)
 
     # Add a sum factorization kernel that implements the evaluation of
     # the basis functions at quadrature points (stage 1)
-    var, _ = sum_factorization_kernel(a_matrices,
-                                      buf,
-                                      1,
-                                      preferred_position=None,
-                                      insn_dep=frozenset({Writes(inp)}),
-                                      outshape=tuple(mat.rows for mat in a_matrices if mat.face is None),
-                                      restriction=restriction,
-                                      direct_input=direct_input,
-                                      )
-
-    if index:
-        index = (index,)
-    else:
-        index = ()
+    from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
+    var, _ = realize_sum_factorization_kernel(sf)
 
     return prim.Subscript(var,
-                          tuple(prim.Variable(i) for i in quadrature_inames()) + index
+                          tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index
                           ), visitor.indices
 
 
@@ -223,7 +180,6 @@ def evaluate_basis(element, name, restriction):
     # Add the missing direction on facedirs by evaluating at either 0 or 1
     if facedir is not None:
         facemod = get_facemod(restriction)
-        from dune.perftool.sumfact.amatrix import PolynomialLookup, name_polynomials
         prod = prod + (prim.Call(PolynomialLookup(name_polynomials(), False),
                                  (prim.Variable(inames[facedir]), facemod)),)
 
@@ -273,7 +229,6 @@ def evaluate_reference_gradient(element, name, restriction):
                                            ))
         if facedir is not None:
             facemod = get_facemod(restriction)
-            from dune.perftool.sumfact.amatrix import PolynomialLookup, name_polynomials
             prod.append(prim.Call(PolynomialLookup(name_polynomials(), facedir == d),
                                   (prim.Variable(inames[facedir]), facemod)),)
 
diff --git a/python/dune/perftool/sumfact/permutation.py b/python/dune/perftool/sumfact/permutation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ebc0be78adf2a00d184fcd8187de38a53b2d648
--- /dev/null
+++ b/python/dune/perftool/sumfact/permutation.py
@@ -0,0 +1,86 @@
+""" Permute sum factorization kernels """
+
+import itertools
+
+
+def sumfact_permutation_heuristic(permutations, stage):
+    """Heuristic to choose a permutation
+
+    - Stage 1: Pick the permutation where in permutations[1:] most
+      elements are ordered by size
+    - Stage 3: Pick the permutation where in permutations[:-1] most
+      elements are ordered by size
+    """
+    def cost(perm, stage):
+        cost = 0
+        for i in range(0, len(perm) - 2):
+            if stage == 1:
+                if perm[i + 1] > perm[i + 2]:
+                    cost += 1
+            if stage == 3:
+                if perm[0] > perm[i + 1]:
+                    cost += 1
+        return cost
+
+    perm = min(permutations, key=lambda i: cost(i, stage))
+    return perm
+
+
+def flop_cost(matrix_sequence):
+    """Computational cost of sumfactorization with this matrix_sequence
+    """
+    cost = 0
+    for l in range(len(matrix_sequence)):
+        cost_m = 1
+        cost_n = 1
+        for i in range(l + 1):
+            cost_m *= matrix_sequence[i].rows
+        for i in range(l, len(matrix_sequence)):
+            cost_n *= matrix_sequence[i].cols
+        cost += cost_m * cost_n
+    return cost
+
+
+def sumfact_permutation_strategy(sf):
+    """Choose permutation of the matrix sequence based on computational cost
+
+    Note: If there are multiple permutations with the same cost a
+    heuristic is used to pick one.
+    """
+    # Extract information from the SumfactKernel object
+    matrix_sequence = sf.matrix_sequence
+    stage = sf.stage
+
+    # Combine permutation and matrix_sequence
+    perm = [i for i, _ in enumerate(matrix_sequence)]
+    perm_matrix_sequence = zip(perm, matrix_sequence)
+
+    # Find cost for all possible permutations of the matrix_sequence
+    perm_cost = []
+    for permutation in itertools.permutations(perm_matrix_sequence):
+        perm, series = zip(*permutation)
+        cost = flop_cost(series)
+        perm_cost.append((perm, cost))
+
+    # Find minimal cost and all permutations with that cost
+    _, costs = zip(*perm_cost)
+    minimal_cost = min(costs)
+    minimal_cost_permutations = [p[0] for p in perm_cost if p[1] == minimal_cost]
+
+    # Use heuristic to pick one of the minimal cost permutations
+    perm = sumfact_permutation_heuristic(minimal_cost_permutations, stage)
+    return perm
+
+
+def permute_forward(t, perm):
+    tmp = []
+    for pos in perm:
+        tmp.append(t[pos])
+    return tuple(tmp)
+
+
+def permute_backward(t, perm):
+    tmp = [None] * len(t)
+    for i, pos in enumerate(perm):
+        tmp[pos] = t[i]
+    return tuple(tmp)
diff --git a/python/dune/perftool/sumfact/quadrature.py b/python/dune/perftool/sumfact/quadrature.py
index 08a5d7e56c70d82bd9a5da52aab0a3c09651b38d..5e6935c3ba368e94300d2e22e6885f98e544af29 100644
--- a/python/dune/perftool/sumfact/quadrature.py
+++ b/python/dune/perftool/sumfact/quadrature.py
@@ -7,10 +7,10 @@ from dune.perftool.generation import (backend,
                                       temporary_variable,
                                       )
 
-from dune.perftool.sumfact.amatrix import (quadrature_points_per_direction,
-                                           name_oned_quadrature_points,
-                                           name_oned_quadrature_weights,
-                                           )
+from dune.perftool.sumfact.tabulation import (quadrature_points_per_direction,
+                                              name_oned_quadrature_points,
+                                              name_oned_quadrature_weights,
+                                              )
 from dune.perftool.pdelab.argument import name_accumulation_variable
 from dune.perftool.pdelab.geometry import (dimension_iname,
                                            local_dimension,
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcc06af6b0ee4121934fc4ecc312493342abc1da
--- /dev/null
+++ b/python/dune/perftool/sumfact/realization.py
@@ -0,0 +1,275 @@
+"""
+The code that triggers the creation of the necessary code constructs
+to realize a sum factorization kernel
+"""
+
+from dune.perftool.generation import (barrier,
+                                      dump_accumulate_timer,
+                                      generator_factory,
+                                      get_global_context_value,
+                                      globalarg,
+                                      instruction,
+                                      post_include,
+                                      silenced_warning,
+                                      transform,
+                                      )
+from dune.perftool.loopy.buffer import (get_buffer_temporary,
+                                        switch_base_storage,
+                                        )
+from dune.perftool.pdelab.argument import pymbolic_coefficient
+from dune.perftool.pdelab.geometry import world_dimension
+from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound
+from dune.perftool.options import get_option
+from dune.perftool.pdelab.signatures import assembler_routine_name
+from dune.perftool.sumfact.permutation import (sumfact_permutation_strategy,
+                                               permute_backward,
+                                               permute_forward,
+                                               )
+from dune.perftool.sumfact.vectorization import attach_vectorization_info
+from dune.perftool.sumfact.accumulation import sumfact_iname
+
+import loopy as lp
+import numpy as np
+import pymbolic.primitives as prim
+
+
+def realize_sum_factorization_kernel(sf, **kwargs):
+    if get_global_context_value("dry_run", False):
+        return sf, sf.insn_dep
+    else:
+        _realize_input(sf)
+        return _realize_sum_factorization_kernel(sf, **kwargs)
+
+
+def _realize_input(sf):
+    # Set up the input for stage 1
+    if sf.stage == 1 and not get_option("fastdg"):
+        assert sf.coeff_func
+        assert sf.input is not None
+
+        # Get the input temporary!
+        input_setup = get_buffer_temporary(sf.buffer,
+                                           shape=sf.flat_input_shape,
+                                           name=sf.input
+                                           )
+
+        # Write initial coefficients into buffer
+        lfs = name_lfs(sf.element, sf.restriction, sf.component)
+        basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
+        container = sf.coeff_func(sf.restriction)
+        coeff = pymbolic_coefficient(container, lfs, basisiname)
+        assignee = prim.Subscript(prim.Variable(input_setup), (prim.Variable(basisiname),) + sf.vec_index)
+        instruction(assignee=assignee,
+                    expression=coeff,
+                    depends_on=sf.insn_dep,
+                    )
+
+
+@generator_factory(item_tags=("sumfactkernel",),
+                   context_tags=("kernel",),
+                   cache_key_generator=lambda s, **kw: s.cache_key)
+def _realize_sum_factorization_kernel(sf):
+    # Add a dependency on the input variable
+    insn_dep = sf.insn_dep
+    if sf.input:
+        insn_dep = insn_dep.union(frozenset({lp.match.Writes(sf.input)}))
+
+    # Construct the direct_input for the FastDG case
+    direct_input = None
+    if get_option('fastdg') and sf.stage == 1:
+        direct_input = sf.coeff_func(sf.restriction)
+
+    direct_output = None
+    if get_option('fastdg') and sf.stage == 3:
+        direct_output = sf.accumvar + ".data()"
+
+    # Prepare some dim_tags/shapes for later use
+    ftags = ",".join(["f"] * sf.length)
+    novec_ftags = ftags
+    ctags = ",".join(["c"] * sf.length)
+    vec_shape = ()
+    if sf.vectorized:
+        ftags = ftags + ",vec"
+        ctags = ctags + ",vec"
+        vec_shape = (4,)
+
+    # Measure times and count operations in c++ code
+    if get_option("instrumentation_level") >= 4:
+        timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage)
+        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+        dump_accumulate_timer(timer_name)
+        insn_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                          depends_on=insn_dep,
+                                          within_inames=frozenset(sf.within_inames))})
+
+    # Put a barrier before the sumfactorization kernel
+    insn_dep = frozenset({barrier(depends_on=insn_dep,
+                                  within_inames=frozenset(sf.within_inames),
+                                  )})
+
+    # Decide in which order we want to process directions in the
+    # sumfactorization. A clever ordering can lead to a reduced
+    # complexity. This will e.g. happen at faces where we only have
+    # one quadratue point m_l=1 if l is the normal direction of the
+    # face.
+    #
+    # Rule of thumb: small m's early and large n's late.
+    perm = sumfact_permutation_strategy(sf)
+
+    # Permute matrix sequence
+    matrix_sequence = permute_forward(sf.matrix_sequence, perm)
+
+    # Product of all matrices
+    for l, matrix in enumerate(matrix_sequence):
+        # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
+        # and get inames that realize the product.
+        inp_shape = (matrix.cols,) + tuple(mat.cols for mat in matrix_sequence[l + 1:]) + tuple(mat.rows for mat in matrix_sequence[:l])
+        out_shape = (matrix.rows,) + tuple(mat.cols for mat in matrix_sequence[l + 1:]) + tuple(mat.rows for mat in matrix_sequence[:l])
+        out_inames = tuple(sumfact_iname(length, "out_inames_" + str(k)) for k, length in enumerate(out_shape))
+        vec_iname = ()
+        if matrix.vectorized:
+            iname = sumfact_iname(4, "vec")
+            vec_iname = (prim.Variable(iname),)
+            transform(lp.tag_inames, [(iname, "vec")])
+
+        # A trivial reduction is implemented as a product, otherwise we run into
+        # a code generation corner case producing way too complicated code. This
+        # could be fixed upstream, but the loopy code realizing reductions is not
+        # trivial and the priority is kind of low.
+        if matrix.cols != 1:
+            k = sumfact_iname(matrix.cols, "red")
+            k_expr = prim.Variable(k)
+        else:
+            k_expr = 0
+
+        # Setup the input of the sum factorization kernel. In the
+        # first matrix multiplication this can be taken from
+        # * an input temporary (default)
+        # * a global data structure (if FastDGGridOperator is in use)
+        # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
+        input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
+        if l == 0 and direct_input is not None:
+            # See comment below
+            input_inames = permute_backward(input_inames, perm)
+            inp_shape = permute_backward(inp_shape, perm)
+
+            globalarg(direct_input, dtype=np.float64, shape=inp_shape, dim_tags=novec_ftags)
+            if matrix.vectorized:
+                input_summand = prim.Call(prim.Variable("Vec4d"),
+                                          (prim.Subscript(prim.Variable(direct_input),
+                                                          input_inames),))
+            else:
+                input_summand = prim.Subscript(prim.Variable(direct_input),
+                                               input_inames + vec_iname)
+        else:
+            # If we did permute the order of a matrices above we also
+            # permuted the order of out_inames. Unfortunately the
+            # order of our input is from 0 to d-1. This means we need
+            # to permute _back_ to get the right coefficients.
+            if l == 0:
+                inp_shape = permute_backward(inp_shape, perm)
+                input_inames = permute_backward(input_inames, perm)
+
+            # Get a temporary that interprets the base storage of the input
+            # as a column-major matrix. In later iteration of the matrix loop
+            # this reinterprets the output of the previous iteration.
+            inp = get_buffer_temporary(sf.buffer,
+                                       shape=inp_shape + vec_shape,
+                                       dim_tags=ftags)
+
+            # The input temporary will only be read from, so we need to silence the loopy warning
+            silenced_warning('read_no_write({})'.format(inp))
+
+            input_summand = prim.Subscript(prim.Variable(inp),
+                                           input_inames + vec_iname)
+
+        switch_base_storage(sf.buffer)
+
+        # Get a temporary that interprets the base storage of the output.
+        #
+        # Note: In this step the reordering of the fastest directions
+        # is happening. The new direction (out_inames[0]) and the
+        # corresponding shape (out_shape[0]) goes to the end (slowest
+        # direction) and everything stays column major (ftags->fortran
+        # style).
+        #
+        # If we are in the last step we reverse the permutation.
+        output_shape = tuple(out_shape[1:]) + (out_shape[0],)
+        if l == len(matrix_sequence) - 1:
+            output_shape = permute_backward(output_shape, perm)
+        out = get_buffer_temporary(sf.buffer,
+                                   shape=output_shape + vec_shape,
+                                   dim_tags=ftags)
+
+        # Write the matrix-matrix multiplication expression
+        matprod = prim.Product((prim.Subscript(prim.Variable(matrix.name),
+                                               (prim.Variable(out_inames[0]), k_expr) + vec_iname),
+                                input_summand))
+
+        # ... which may be a reduction, if k>0
+        if matrix.cols != 1:
+            matprod = lp.Reduction("sum", k, matprod)
+
+        # Here we also move the new direction (out_inames[0]) to the
+        # end and reverse permutation
+        output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
+        if l == len(matrix_sequence) - 1:
+            output_inames = permute_backward(output_inames, perm)
+
+        # In case of direct output we directly accumulate the result
+        # of the Sumfactorization into some global data structure.
+        if l == len(matrix_sequence) - 1 and direct_output is not None:
+            ft = get_global_context_value("form_type")
+            if ft == 'residual' or ft == 'jacobian_apply':
+                globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags)
+                assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
+            else:
+                assert ft == 'jacobian'
+                globalarg(direct_output,
+                          dtype=np.float64,
+                          shape=output_shape + output_shape,
+                          dim_tags=novec_ftags + "," + novec_ftags)
+                # TODO: It is at least questionnable, whether using the *order* of the inames in here
+                #       for indexing is a good idea. Then again, it is hard to find an alternative.
+                _ansatz_inames = tuple(prim.Variable(i) for i in sf.within_inames)
+                assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames)
+
+            # In case of vectorization we need to apply a horizontal add
+            if matrix.vectorized:
+                matprod = prim.Call(prim.Variable("horizontal_add"),
+                                    (matprod,))
+
+            # We need to accumulate
+            matprod = prim.Sum((assignee, matprod))
+        else:
+            assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
+
+        # Issue the reduction instruction that implements the multiplication
+        # at the same time store the instruction ID for the next instruction to depend on
+        insn_dep = frozenset({instruction(assignee=assignee,
+                                          expression=matprod,
+                                          forced_iname_deps=frozenset([iname for iname in out_inames]).union(frozenset(sf.within_inames)),
+                                          forced_iname_deps_is_final=True,
+                                          depends_on=insn_dep,
+                                          )
+                              })
+
+    # Measure times and count operations in c++ code
+    if get_option("instrumentation_level") >= 4:
+        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                          depends_on=insn_dep,
+                                          within_inames=frozenset(sf.within_inames))})
+        if sf.stage == 1:
+            qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+            dump_accumulate_timer(timer_name)
+            insn_dep = instruction(code="HP_TIMER_START({});".format(qp_timer_name),
+                                   depends_on=insn_dep)
+
+    out = get_buffer_temporary(sf.buffer,
+                               shape=sf.output_shape,
+                               dim_tags=sf.output_dimtags,
+                               )
+    silenced_warning('read_no_write({})'.format(out))
+
+    return sf.output_to_pymbolic(out), insn_dep
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
deleted file mode 100644
index a91079f54ff2fb4ef1746e7d737d7fbc04f3456c..0000000000000000000000000000000000000000
--- a/python/dune/perftool/sumfact/sumfact.py
+++ /dev/null
@@ -1,704 +0,0 @@
-import copy
-import itertools
-
-from dune.perftool.loopy.symbolic import substitute
-from dune.perftool.pdelab.argument import (name_accumulation_variable,
-                                           name_coefficientcontainer,
-                                           pymbolic_coefficient,
-                                           PDELabAccumulationFunction,
-                                           )
-from dune.perftool.generation import (backend,
-                                      barrier,
-                                      built_instruction,
-                                      domain,
-                                      dump_accumulate_timer,
-                                      function_mangler,
-                                      generator_factory,
-                                      get_counter,
-                                      get_global_context_value,
-                                      globalarg,
-                                      iname,
-                                      instruction,
-                                      post_include,
-                                      kernel_cached,
-                                      retrieve_cache_items,
-                                      silenced_warning,
-                                      temporary_variable,
-                                      transform,
-                                      )
-from dune.perftool.options import get_option
-from dune.perftool.loopy.flatten import flatten_index
-from dune.perftool.loopy.buffer import (get_buffer_temporary,
-                                        initialize_buffer,
-                                        switch_base_storage,
-                                        )
-from dune.perftool.sumfact.quadrature import nest_quadrature_loops
-from dune.perftool.pdelab.localoperator import determine_accumulation_space
-from dune.perftool.pdelab.restriction import restricted_name
-from dune.perftool.pdelab.signatures import assembler_routine_name
-from dune.perftool.pdelab.spaces import (name_lfs,
-                                         name_lfs_bound,
-                                         )
-from dune.perftool.pdelab.geometry import (local_dimension,
-                                           world_dimension,
-                                           )
-from dune.perftool.sumfact.amatrix import (AMatrix,
-                                           LargeAMatrix,
-                                           quadrature_points_per_direction,
-                                           basis_functions_per_direction,
-                                           construct_amatrix_sequence,
-                                           )
-from dune.perftool.sumfact.switch import (get_facedir,
-                                          get_facemod,
-                                          )
-from dune.perftool.loopy.symbolic import SumfactKernel
-from dune.perftool.ufl.modified_terminals import extract_modified_arguments
-from dune.perftool.tools import get_pymbolic_basename
-from dune.perftool.error import PerftoolError
-from pymbolic.primitives import (Call,
-                                 Product,
-                                 Subscript,
-                                 Variable,
-                                 )
-from dune.perftool.sumfact.quadrature import quadrature_inames
-from dune.perftool.sumfact.vectorization import find_sumfact
-from loopy.symbolic import FunctionIdentifier, IdentityMapper
-
-import loopy as lp
-import numpy as np
-import pymbolic.primitives as prim
-from pytools import product
-
-
-@function_mangler
-def horizontal_add_function_mangler(knl, func, arg_dtypes):
-    if func == "horizontal_add":
-        from dune.perftool.loopy.vcl import get_vcl_type
-        vcl = lp.types.NumpyType(get_vcl_type(np.float64, register_size=256))
-        return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(np.float64),), (vcl,))
-
-
-@iname
-def _sumfact_iname(bound, _type, count):
-    name = "sf_{}_{}".format(_type, str(count))
-    domain(name, bound)
-    return name
-
-
-def sumfact_iname(bound, _type):
-    count = get_counter('_sumfac_iname_{}'.format(_type))
-    name = _sumfact_iname(bound, _type, count)
-    return name
-
-
-@kernel_cached
-def accum_iname(restriction, bound, i):
-    return sumfact_iname(bound, "accum")
-
-
-def setup_theta(inp, element, restriction, component, index, coeff_func):
-    if index is None:
-        index = ()
-    else:
-        index = (index,)
-    # Write initial coefficients into buffer
-    lfs = name_lfs(element, restriction, component)
-    basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
-    container = coeff_func(restriction)
-    coeff = pymbolic_coefficient(container, lfs, basisiname)
-    assignee = Subscript(Variable(inp), (Variable(basisiname),) + index)
-    return instruction(assignee=assignee,
-                       expression=coeff,
-                       )
-
-
-def name_test_function_contribution(test):
-    count = get_counter('sumfact_contribution_counter')
-    grad = "grad_" if test.reference_grad else ""
-    return restricted_name("contrib_{}phi_{}".format(grad, str(count)), test.restriction)
-
-
-@backend(interface="accum_insn", name="sumfact")
-def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
-    pymbolic_expr = visitor(accterm.term)
-
-    if pymbolic_expr == 0:
-        return
-
-    dim = world_dimension()
-
-    # If this is a gradient, we find the gradient iname
-    additional_inames = frozenset({})
-    if accterm.new_indices is not None:
-        for i in accterm.new_indices:
-            if i not in visitor.dimension_indices:
-                from dune.perftool.pdelab.localoperator import grad_iname
-                additional_inames = additional_inames.union(frozenset({grad_iname(i, dim)}))
-
-    def emit_sumfact_kernel(i, restriction, insn_dep):
-        # Construct the matrix sequence for this sum factorization
-        a_matrices = construct_amatrix_sequence(transpose=True,
-                                                derivative=i if accterm.new_indices else None,
-                                                facedir=get_facedir(accterm.argument.restriction),
-                                                facemod=get_facemod(accterm.argument.restriction),
-                                                )
-
-        # Get the vectorization info. If this happens during the dry run, we get dummies
-        from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction))
-
-        # Initialize a base storage for this buffer and get a temporay pointing to it
-        shape = tuple(mat.cols for mat in a_matrices if mat.face is None)
-        dim_tags = ",".join(['f'] * local_dimension())
-        if index is not None:
-            shape = shape + (4,)
-            dim_tags = dim_tags + ",c"
-            index = (index,)
-            vectag = frozenset({"gradvec"})
-        else:
-            index = ()
-            vectag = frozenset()
-
-        base_storage_size = product(max(mat.rows, mat.cols) for mat in a_matrices)
-        temp = initialize_buffer(buf,
-                                 base_storage_size=base_storage_size,
-                                 num=2
-                                 ).get_temporary(shape=shape,
-                                                 dim_tags=dim_tags,
-                                                 name=inp,
-                                                 )
-
-        # Those input fields, that are padded need to be set to zero
-        # in order to do a horizontal_add later on
-        for pad in padding:
-            assignee = prim.Subscript(prim.Variable(temp),
-                                      tuple(Variable(i) for i in quadrature_inames()) + (pad,))
-            instruction(assignee=assignee,
-                        expression=0,
-                        forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
-                        forced_iname_deps_is_final=True,
-                        tags=frozenset(["quadvec", "gradvec"])
-                        )
-
-        # Replace gradient iname with correct index for assignement
-        replace_dict = {}
-        for iname in additional_inames:
-            replace_dict[Variable(iname)] = i
-        expression = substitute(pymbolic_expr, replace_dict)
-
-        # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
-        timer_dep = frozenset()
-        if get_option("instrumentation_level") >= 4:
-            timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
-            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-            dump_accumulate_timer(timer_name)
-            if(visitor.inames):
-                timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                                   within_inames=frozenset(visitor.inames))})
-
-        # Determine dependencies
-        from loopy.match import Or, Writes
-        from loopy.symbolic import DependencyMapper
-        from dune.perftool.tools import get_pymbolic_basename
-        deps = Or(tuple(Writes(get_pymbolic_basename(expr)) for expr in DependencyMapper()(expression)))
-
-        # Issue an instruction in the quadrature loop that fills the buffer
-        # with the evaluation of the contribution at all quadrature points
-        assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + index)
-        contrib_dep = instruction(assignee=assignee,
-                                  expression=expression,
-                                  forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
-                                  forced_iname_deps_is_final=True,
-                                  tags=frozenset({"quadvec"}).union(vectag),
-                                  depends_on=frozenset({deps}).union(timer_dep)
-                                  )
-
-        if insn_dep is None:
-            insn_dep = frozenset({contrib_dep})
-
-        if get_option("instrumentation_level") >= 4:
-            insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                              depends_on=insn_dep,
-                                              within_inames=frozenset(visitor.inames))})
-
-        inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i)
-                       for i, mat in enumerate(a_matrices))
-
-        # Collect the lfs and lfs indices for the accumulate call
-        test_lfs = determine_accumulation_space(accterm.argument.expr, 0, measure)
-        test_lfs.index = flatten_index(tuple(Variable(i) for i in inames),
-                                       (basis_functions_per_direction(),) * dim,
-                                       order="f"
-                                       )
-
-        # In the jacobian case, also determine the space for the ansatz space
-        ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure)
-        rank = 2 if visitor.inames else 1
-        if rank == 2:
-            # TODO the next line should get its inames from
-            # elsewhere. This is *NOT* robust (but works right now)
-            ansatz_lfs.index = flatten_index(tuple(Variable(visitor.inames[i])
-                                                   for i in range(world_dimension())),
-                                             (basis_functions_per_direction(),) * dim,
-                                             order="f"
-                                             )
-
-        # Construct the expression representing "{r,jac}.accumulate(..)"
-        accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
-
-        # We can directly accumulate the solution in the last step of
-        # the sumfactorization. Currently this is only implemented for
-        # FastDGGridOperator.
-        direct_output = None
-        if get_option('fastdg'):
-            ft = get_global_context_value("form_type")
-            accum = accum + ".data()"
-            if ft == 'residual' or ft == 'jacobian_apply':
-                direct_output = accum
-            else:
-                assert ft == 'jacobian'
-                direct_output = accum
-
-        # Add a sum factorization kernel that implements the multiplication
-        # with the test function (stage 3)
-        pref_pos = i if accterm.new_indices else None
-        result, insn_dep = sum_factorization_kernel(a_matrices,
-                                                    buf,
-                                                    3,
-                                                    insn_dep=insn_dep,
-                                                    additional_inames=frozenset(visitor.inames),
-                                                    preferred_position=pref_pos,
-                                                    restriction=(accterm.argument.restriction,
-                                                                 restriction),
-                                                    direct_output=direct_output,
-                                                    visitor=visitor
-                                                    )
-
-        # Determine the expression to accumulate with. This depends on the vectorization strategy!
-        result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
-        vecinames = ()
-        if index:
-            iname = accum_iname((accterm.argument.restriction, restriction), 4, "vec")
-            vecinames = (iname,)
-            transform(lp.tag_inames, [(iname, "vec")])
-            from dune.perftool.tools import maybe_wrap_subscript
-            result = prim.Call(prim.Variable("horizontal_add"),
-                               (maybe_wrap_subscript(result, prim.Variable(iname)),),
-                               )
-
-        # In the fastdg case we accumulate directly in the last step of the SF!
-        if get_option('fastdg'):
-            # In the dry run we need an instruction with a
-            # sumfactorization node. We use this to decide on a
-            # vectorization strategy. This is just a dummy
-            # instruction, in the real run the accumulation is done
-            # directly in the sumfactorization.
-            if get_global_context_value("dry_run", False):
-                dummy = "dummy"
-                globalarg(dummy, dtype=np.float64)
-                instruction(assignee=prim.Variable(dummy),
-                            expression=result,
-                            forced_iname_deps=frozenset(inames),
-                            forced_iname_deps_is_final=True,
-                            depends_on=insn_dep,
-                            )
-        # Default: Generate accumulation instructions
-        else:
-            expr = Call(PDELabAccumulationFunction(accum, rank),
-                        (test_lfs.get_args() +
-                         ansatz_lfs.get_args() +
-                         (result,)
-                         )
-                        )
-            instruction(assignees=(),
-                        expression=expr,
-                        forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
-                        forced_iname_deps_is_final=True,
-                        depends_on=insn_dep,
-                        )
-
-        # Mark the transformation that moves the quadrature loop
-        # inside the trialfunction loops for application
-        transform(nest_quadrature_loops, visitor.inames)
-
-        return insn_dep
-
-    # Extract the restrictions on argument-1:
-    jac_restrictions = frozenset(tuple(ma.restriction for ma in
-                                       extract_modified_arguments(accterm.term, argnumber=1, do_index=True)))
-    if not jac_restrictions:
-        jac_restrictions = frozenset({0})
-
-    insn_dep = None
-    for restriction in jac_restrictions:
-        if accterm.new_indices:
-            for i in range(world_dimension()):
-                insn_dep = emit_sumfact_kernel(i, restriction, insn_dep)
-        else:
-            insn_dep = emit_sumfact_kernel(None, restriction, insn_dep)
-
-
-def _sf_permutation_heuristic(permutations, stage):
-    """Heuristic to choose a permutation
-
-    - Stage 1: Pick the permutation where in permutations[1:] most
-      elements are ordered by size
-    - Stage 3: Pick the permutation where in permutations[:-1] most
-      elements are ordered by size
-    """
-    def cost(perm, stage):
-        cost = 0
-        for i in range(0, len(perm) - 2):
-            if stage == 1:
-                if perm[i + 1] > perm[i + 2]:
-                    cost += 1
-            if stage == 3:
-                if perm[0] > perm[i + 1]:
-                    cost += 1
-        return cost
-
-    perm = min(permutations, key=lambda i: cost(i, stage))
-    return perm
-
-
-def _sf_flop_cost(a_matrices):
-    """Computational cost of sumfactorization with this list of a_matrices
-    """
-    cost = 0
-    for l in range(len(a_matrices)):
-        cost_m = 1
-        cost_n = 1
-        for i in range(l + 1):
-            cost_m *= a_matrices[i].rows
-        for i in range(l, len(a_matrices)):
-            cost_n *= a_matrices[i].cols
-        cost += cost_m * cost_n
-    return cost
-
-
-def _sf_permutation_strategy(a_matrices, stage):
-    """Choose permutation of a_matrices list based on computational cost
-
-    Note: If there are multiple permutations with the same cost a
-    heuristic is used to pick one.
-    """
-    # Combine permutation and a_matrices list
-    perm = [i for i, _ in enumerate(a_matrices)]
-    perm_a_matrices = zip(perm, a_matrices)
-
-    # Find cost for all possible permutations of a_matrices list
-    perm_cost = []
-    for permutation in itertools.permutations(perm_a_matrices):
-        perm, series = zip(*permutation)
-        cost = _sf_flop_cost(series)
-        perm_cost.append((perm, cost))
-
-    # Find minimal cost and all permutations with that cost
-    _, costs = zip(*perm_cost)
-    minimal_cost = min(costs)
-    minimal_cost_permutations = [p[0] for p in perm_cost if p[1] == minimal_cost]
-
-    # Use heuristic to pic one of the minimal cost permutations
-    perm = _sf_permutation_heuristic(minimal_cost_permutations, stage)
-    return perm
-
-
-def _permute_forward(t, perm):
-    tmp = []
-    for pos in perm:
-        tmp.append(t[pos])
-    return tuple(tmp)
-
-
-def _permute_backward(t, perm):
-    tmp = [None] * len(t)
-    for i, pos in enumerate(perm):
-        tmp[pos] = t[i]
-    return tuple(tmp)
-
-
-@generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s, kw.get("restriction", 0)))
-def sum_factorization_kernel(a_matrices,
-                             buf,
-                             stage,
-                             insn_dep=frozenset({}),
-                             additional_inames=frozenset({}),
-                             preferred_position=None,
-                             outshape=None,
-                             restriction=0,
-                             direct_input=None,
-                             direct_output=None,
-                             visitor=None,
-                             ):
-    """Create a sum factorization kernel
-
-    Sum factorization can be written as
-
-    Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...)
-
-    with:
-    - X: Input rank d tensor of dimension n_0 x ... x n_{d-1}
-    - Y: Output rank d tensor of dimension m_0 x ... x m_{d-1}
-    - A_l: Values of 1D basis evaluations at quadrature points in l
-      direction, matrix of dimension m_l x n_l
-    - R_l: Transformation operator that permutes the underlying data
-      vector of the rank d tensor in such a way that the fastest
-      direction gets the slowest direction
-
-    In the l'th step we have the following setup:
-    - A_l: Matrix of dimensions m_l x n_l
-    - X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1}
-    - R_l: Transformation operator
-
-    Looking at the indizes the following will happen:
-    X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}]
-    A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
-    R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
-
-    So the multiplication with A_l is a reduction over one index and
-    the transformation brings the next reduction index in the fastest
-    position.
-
-    Note: In the code below the transformation step is directly done
-    in the reduction instruction by adapting the assignee!
-
-    It can make sense to permute the order of directions. If you have
-    a small m_l (e.g. stage 1 on faces) it is better to do direction l
-    first. This can be done by:
-
-    - Permuting the order of the A matrices.
-    - Permuting the input tensor.
-    - Permuting the output tensor (this assures that the directions of
-      the output tensor are again ordered from 0 to d-1).
-
-    Arguments:
-    ----------
-    a_matrices: An iterable of AMatrix instances
-        The list of tensors to be applied to the input.
-        Order of application is from 0 up.
-    buf: A string identifying the flip flop buffer in use
-        for intermediate results. The memory is expected to be
-        pre-initialized with the input or you have to provide
-        direct_input (FastDGGridOperator).
-    insn_dep: An instruction ID that the first issued instruction
-        should depend upon. All following ones will depend on each
-        other.
-    additional_inames: Instructions will be executed within those
-        inames (needed for stage 3 in jacobians).
-    preferred_position: Will be used in the dry run to order kernels
-        when doing vectorization e.g. (dx u,dy u,dz u, u).
-    outshape: Shape of the output.
-    restriction: Restriction for faces values.
-    direct_input: Global data structure containing input for
-        sumfactorization (e.g. when using FastDGGridOperator).
-    """
-    # Return a pymbolic SumfactKernel node in the dry run. This will
-    # be used to decide on an appropriate vectorization strategy
-    # before we do the real thing.
-    if get_global_context_value("dry_run", False):
-        return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
-
-    ftags = ",".join(["f"] * len(a_matrices))
-    novec_ftags = ftags
-    ctags = ",".join(["c"] * len(a_matrices))
-    vec_shape = ()
-    if next(iter(a_matrices)).vectorized:
-        ftags = ftags + ",vec"
-        ctags = ctags + ",vec"
-        vec_shape = (4,)
-
-    # Measure times and count operations in c++ code
-    if get_option("instrumentation_level") >= 4:
-        timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(stage)
-        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-        dump_accumulate_timer(timer_name)
-        insn_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                          depends_on=insn_dep,
-                                          within_inames=additional_inames)})
-
-    # Put a barrier before the sumfactorization kernel
-    insn_dep = frozenset({barrier(depends_on=insn_dep,
-                                  within_inames=additional_inames,
-                                  )})
-
-    # Decide in which order we want to process directions in the
-    # sumfactorization. A clever ordering can lead to a reduced
-    # complexity. This will e.g. happen at faces where we only have
-    # one quadratue point m_l=1 if l is the normal direction of the
-    # face.
-    #
-    # Rule of thumb: small m's early and large n's late.
-    perm = _sf_permutation_strategy(a_matrices, stage)
-
-    # Permute a_matrices
-    a_matrices = _permute_forward(a_matrices, perm)
-
-    # Product of all matrices
-    for l, a_matrix in enumerate(a_matrices):
-        # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
-        # and get inames that realize the product.
-        inp_shape = (a_matrix.cols,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
-        out_shape = (a_matrix.rows,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
-        out_inames = tuple(sumfact_iname(length, "out_inames_" + str(k)) for k, length in enumerate(out_shape))
-        vec_iname = ()
-        if a_matrix.vectorized:
-            iname = sumfact_iname(4, "vec")
-            vec_iname = (prim.Variable(iname),)
-            transform(lp.tag_inames, [(iname, "vec")])
-
-        # A trivial reduction is implemented as a product, otherwise we run into
-        # a code generation corner case producing way too complicated code. This
-        # could be fixed upstream, but the loopy code realizing reductions is not
-        # trivial and the priority is kind of low.
-        if a_matrix.cols != 1:
-            k = sumfact_iname(a_matrix.cols, "red")
-            k_expr = prim.Variable(k)
-        else:
-            k_expr = 0
-
-        # Setup the input of the sum factorization kernel. In the
-        # first matrix multiplication this can be taken from
-        # * an input temporary (default)
-        # * a global data structure (if FastDGGridOperator is in use)
-        # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
-        input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
-        if l == 0 and direct_input is not None:
-            # See comment below
-            input_inames = _permute_backward(input_inames, perm)
-            inp_shape = _permute_backward(inp_shape, perm)
-
-            globalarg(direct_input, dtype=np.float64, shape=inp_shape, dim_tags=novec_ftags)
-            if a_matrix.vectorized:
-                input_summand = prim.Call(prim.Variable("Vec4d"),
-                                          (prim.Subscript(prim.Variable(direct_input),
-                                                          input_inames),))
-            else:
-                input_summand = prim.Subscript(prim.Variable(direct_input),
-                                               input_inames + vec_iname)
-        else:
-            # If we did permute the order of a matrices above we also
-            # permuted the order of out_inames. Unfortunately the
-            # order of our input is from 0 to d-1. This means we need
-            # to permute _back_ to get the right coefficients.
-            if l == 0:
-                inp_shape = _permute_backward(inp_shape, perm)
-                input_inames = _permute_backward(input_inames, perm)
-
-            # Get a temporary that interprets the base storage of the input
-            # as a column-major matrix. In later iteration of the amatrix loop
-            # this reinterprets the output of the previous iteration.
-            inp = get_buffer_temporary(buf,
-                                       shape=inp_shape + vec_shape,
-                                       dim_tags=ftags)
-
-            # The input temporary will only be read from, so we need to silence the loopy warning
-            silenced_warning('read_no_write({})'.format(inp))
-
-            input_summand = prim.Subscript(prim.Variable(inp),
-                                           input_inames + vec_iname)
-
-        switch_base_storage(buf)
-
-        # Get a temporary that interprets the base storage of the output.
-        #
-        # Note: In this step the reordering of the fastest directions
-        # is happening. The new direction (out_inames[0]) and the
-        # corresponding shape (out_shape[0]) goes to the end (slowest
-        # direction) and everything stays column major (ftags->fortran
-        # style).
-        #
-        # If we are in the last step we reverse the permutation.
-        output_shape = tuple(out_shape[1:]) + (out_shape[0],)
-        if l == len(a_matrices) - 1:
-            output_shape = _permute_backward(output_shape, perm)
-        out = get_buffer_temporary(buf,
-                                   shape=output_shape + vec_shape,
-                                   dim_tags=ftags)
-
-        # Write the matrix-matrix multiplication expression
-        matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
-                                          (prim.Variable(out_inames[0]), k_expr) + vec_iname),
-                           input_summand))
-
-        # ... which may be a reduction, if k>0
-        if a_matrix.cols != 1:
-            matprod = lp.Reduction("sum", k, matprod)
-
-        # Here we also move the new direction (out_inames[0]) to the
-        # end and reverse permutation
-        output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
-        if l == len(a_matrices) - 1:
-            output_inames = _permute_backward(output_inames, perm)
-
-        # In case of direct output we directly accumulate the result
-        # of the Sumfactorization into some global data structure.
-        if l == len(a_matrices) - 1 and direct_output is not None:
-            ft = get_global_context_value("form_type")
-            if ft == 'residual' or ft == 'jacobian_apply':
-                globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags)
-                assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
-            else:
-                assert ft == 'jacobian'
-                globalarg(direct_output,
-                          dtype=np.float64,
-                          shape=output_shape + output_shape,
-                          dim_tags=novec_ftags + "," + novec_ftags)
-                # TODO the next line should get its inames from
-                # elsewhere. This is *NOT* robust (but works right
-                # now)
-                _ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension()))
-                assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames)
-
-            # In case of vectorization we need to apply a horizontal add
-            if a_matrix.vectorized:
-                matprod = prim.Call(prim.Variable("horizontal_add"),
-                                    (matprod,))
-
-            # We need to accumulate
-            matprod = prim.Sum((assignee, matprod))
-        else:
-            assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
-
-        # Issue the reduction instruction that implements the multiplication
-        # at the same time store the instruction ID for the next instruction to depend on
-        insn_dep = frozenset({instruction(assignee=assignee,
-                                          expression=matprod,
-                                          forced_iname_deps=frozenset([iname for iname in out_inames]).union(additional_inames),
-                                          forced_iname_deps_is_final=True,
-                                          depends_on=insn_dep,
-                                          )
-                              })
-
-    # Measure times and count operations in c++ code
-    if get_option("instrumentation_level") >= 4:
-        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                          depends_on=insn_dep,
-                                          within_inames=additional_inames)})
-        if stage == 1:
-            qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
-            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-            dump_accumulate_timer(timer_name)
-            insn_dep = instruction(code="HP_TIMER_START({});".format(qp_timer_name),
-                                   depends_on=insn_dep)
-
-    if outshape is None:
-        assert stage == 3
-        outshape = tuple(mat.rows for mat in a_matrices)
-
-    dim_tags = ",".join(['f'] * len(outshape))
-
-    if next(iter(a_matrices)).vectorized:
-        outshape = outshape + vec_shape
-        # This is a 'bit' hacky: In stage 3 we need to return something with vectag, in stage 1 not.
-        if stage == 1:
-            dim_tags = dim_tags + ",c"
-        else:
-            dim_tags = dim_tags + ",vec"
-
-    out = get_buffer_temporary(buf,
-                               shape=outshape,
-                               dim_tags=dim_tags,
-                               )
-    silenced_warning('read_no_write({})'.format(out))
-
-    return next(iter(a_matrices)).output_to_pymbolic(out), insn_dep
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb03a88d5f423fcd079a94598663ffc69de4937e
--- /dev/null
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -0,0 +1,263 @@
+""" A pymbolic node representing a sum factorization kernel """
+
+from pytools import ImmutableRecord
+
+import pymbolic.primitives as prim
+import loopy as lp
+import inspect
+
+
+class SumfactKernel(ImmutableRecord, prim.Variable):
+    def __init__(self,
+                 matrix_sequence=None,
+                 buffer=None,
+                 stage=1,
+                 preferred_position=None,
+                 restriction=0,
+                 within_inames=(),
+                 input=None,
+                 padding=frozenset(),
+                 index=None,
+                 insn_dep=frozenset(),
+                 coeff_func=None,
+                 element=None,
+                 component=None,
+                 accumvar=None,
+                 ):
+        """Create a sum factorization kernel
+
+        Sum factorization can be written as
+
+        Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...)
+
+        with:
+        - X: Input rank d tensor of dimension n_0 x ... x n_{d-1}
+        - Y: Output rank d tensor of dimension m_0 x ... x m_{d-1}
+        - A_l: Values of 1D basis evaluations at quadrature points in l
+               direction, matrix of dimension m_l x n_l
+        - R_l: Transformation operator that permutes the underlying data
+               vector of the rank d tensor in such a way that the fastest
+               direction gets the slowest direction
+
+        In the l'th step we have the following setup:
+        - A_l: Matrix of dimensions m_l x n_l
+        - X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1}
+        - R_l: Transformation operator
+
+        Looking at the indizes the following will happen:
+        X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}]
+        A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
+        R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
+
+        So the multiplication with A_l is a reduction over one index and
+        the transformation brings the next reduction index in the fastest
+        position.
+
+        It can make sense to permute the order of directions. If you have
+        a small m_l (e.g. stage 1 on faces) it is better to do direction l
+        first. This can be done by:
+
+        - Permuting the order of the A matrices.
+        - Permuting the input tensor.
+        - Permuting the output tensor (this assures that the directions of
+          the output tensor are again ordered from 0 to d-1).
+
+        Note, that you will typically *not* set all of the below arguments,
+        but only some. The vectorization strategy may set others for you.
+        The only argument really needed in all cases is matrix_sequence.
+
+        Arguments:
+        ----------
+        matrix_sequence: A tuple of BasisTabulationMatrixBase instances
+            The list of tensors to be applied to the input.
+            Order of application is from 0 up.
+        buffer: A string identifying the flip flop buffer in use
+            for intermediate results. The memory is expected to be
+            pre-initialized with the input or you have to provide
+            direct_input (FastDGGridOperator).
+        stage: 1 or 3
+        insn_dep: An instruction ID that the first issued instruction
+            should depend upon. All following ones will depend on each
+            other.
+        within_inames: Instructions will be executed within those
+            inames (needed for stage 3 in jacobians).
+        preferred_position: Will be used in the dry run to order kernels
+            when doing vectorization e.g. (dx u,dy u,dz u, u).
+        restriction: Restriction for faces values.
+        input: The name to use for the input temporary
+        padding: a set of slots in the input temporary to be padded
+        index: The slot in the input temporary dedicated to this kernel
+        coeff_func: The function to call to get the input coefficient
+        element: The UFL element
+        component: The treepath to the correct component of above element
+        accumvar: The accumulation variable to accumulate into
+        """
+        # Check the input and apply defaults where necessary
+        assert isinstance(matrix_sequence, tuple)
+        from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase
+        assert all(isinstance(m, BasisTabulationMatrixBase) for m in matrix_sequence)
+
+        assert stage in (1, 3)
+
+        if preferred_position is not None:
+            assert isinstance(preferred_position, int)
+
+        if stage == 3:
+            assert isinstance(restriction, tuple)
+
+        assert isinstance(within_inames, tuple)
+
+        assert isinstance(insn_dep, frozenset)
+
+        # The following construction is a bit weird: Dict comprehensions do not have
+        # access to the locals of the calling scope: So we need to do the eval beforehand
+        defaults = [eval(a) for a in SumfactKernel.init_arg_names]
+        defaultdict = {a: defaults[SumfactKernel.init_arg_names.index(a)] for a in SumfactKernel.init_arg_names}
+
+        # Call the base class constructors
+        ImmutableRecord.__init__(self, **defaultdict)
+        prim.Variable.__init__(self, "SUMFACT")
+
+    #
+    # The methods/fields needed to get a well-formed pymbolic node
+    #
+
+    def __getinitargs__(self):
+        return tuple(getattr(self, arg) for arg in SumfactKernel.init_arg_names)
+
+    def stringifier(self):
+        return lp.symbolic.StringifyMapper
+
+    mapper_method = "map_sumfact_kernel"
+
+    #
+    # Some cache key definitions
+    # Watch out for the documentation to see which key is used unter what circumstances
+    #
+
+    @property
+    def cache_key(self):
+        """ The cache key that can be used in generation magic
+        Any two sum factorization kernels having the same cache_key
+        are realized simulatenously!
+        """
+        return (self.matrix_sequence, self.restriction, self.stage, self.buffer)
+
+    @property
+    def input_key(self):
+        """ A cache key for the input coefficients
+        Any two sum factorization kernels having the same input_key
+        work on the same input coefficient (and are suitable for simultaneous
+        treatment because of that)
+        """
+        return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar)
+
+    #
+    # Some convenience methods to extract information about the sum factorization kernel
+    #
+
+    @property
+    def length(self):
+        """ The number of matrices to apply """
+        return len(self.matrix_sequence)
+
+    @property
+    def vectorized(self):
+        return next(iter(self.matrix_sequence)).vectorized
+
+    @property
+    def transposed(self):
+        return next(iter(self.matrix_sequence)).transpose
+
+    @property
+    def vec_index(self):
+        """ A tuple with the vector index
+
+        Defaults to the empty tuple to allow addition to any
+        existing index tuple """
+        if self.index is not None:
+            return (self.index,)
+        else:
+            return ()
+
+    @property
+    def flat_input_shape(self):
+        """ The 'flat' input tensor shape """
+        from pytools import product
+        shape = (product(mat.cols for mat in self.matrix_sequence),)
+        if self.vectorized:
+            shape = shape + (4,)
+        return shape
+
+    @property
+    def quadrature_shape(self):
+        """ The shape of a temporary for the quadrature points
+
+        Takes into account the lower dimensionality of faces and vectorization.
+        """
+        if self.transposed:
+            shape = tuple(mat.cols for mat in self.matrix_sequence if mat.face is None)
+        else:
+            shape = tuple(mat.rows for mat in self.matrix_sequence if mat.face is None)
+        if self.vectorized:
+            shape = shape + (4,)
+        return shape
+
+    @property
+    def quadrature_dimtags(self):
+        """ The dim_tags of a temporary for the quadrature points
+
+        Takes into account the lower dimensionality of faces and vectorization.
+        """
+        tags = ["f"] * len(self.quadrature_shape)
+        if self.vectorized:
+            tags[-1] = 'c'
+        return ",".join(tags)
+
+    @property
+    def dof_shape(self):
+        """ The shape of a temporary for the degrees of freedom
+
+        Takes into account vectorization.
+        """
+        shape = tuple(mat.rows for mat in self.matrix_sequence)
+        if self.vectorized:
+            shape = shape + (4,)
+        return shape
+
+    @property
+    def dof_dimtags(self):
+        """ The dim_tags of a temporary for the degrees of freedom
+
+        Takes into account vectorization.
+        """
+        tags = ["f"] * len(self.dof_shape)
+        if self.vectorized:
+            tags[-1] = 'vec'
+        return ",".join(tags)
+
+    @property
+    def output_shape(self):
+        if self.stage == 1:
+            return self.quadrature_shape
+        else:
+            return self.dof_shape
+
+    @property
+    def output_dimtags(self):
+        if self.stage == 1:
+            return self.quadrature_dimtags
+        else:
+            return self.dof_dimtags
+
+    def output_to_pymbolic(self, name):
+        if self.vectorized:
+            return lp.TaggedVariable(name, "vector")
+        else:
+            return lp.TaggedVariable(name, "sumfac")
+
+
+# Extract the argument list and store it on the class. This needs to be done
+# outside of the class because the SumfactKernel class object needs to be fully
+# initialized in order to extract the information from __init__.
+SumfactKernel.init_arg_names = tuple(inspect.getargspec(SumfactKernel.__init__)[0][1:])
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/tabulation.py
similarity index 88%
rename from python/dune/perftool/sumfact/amatrix.py
rename to python/dune/perftool/sumfact/tabulation.py
index 16f8b5289e074665f540ad67593af7ff20598bde..c6fa99e3b28a1a9f1a499d6559461a5ec1af7cc2 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -35,15 +35,19 @@ import loopy as lp
 import numpy
 
 
-class AMatrix(ImmutableRecord):
+class BasisTabulationMatrixBase(ImmutableRecord):
+    pass
+
+
+class BasisTabulationMatrix(BasisTabulationMatrixBase):
     def __init__(self, rows, cols, transpose=False, derivative=False, face=None):
-        ImmutableRecord.__init__(self,
-                                 rows=rows,
-                                 cols=cols,
-                                 transpose=transpose,
-                                 derivative=derivative,
-                                 face=face,
-                                 )
+        BasisTabulationMatrixBase.__init__(self,
+                                           rows=rows,
+                                           cols=cols,
+                                           transpose=transpose,
+                                           derivative=derivative,
+                                           face=face,
+                                           )
 
     @property
     def name(self):
@@ -53,20 +57,17 @@ class AMatrix(ImmutableRecord):
     def vectorized(self):
         return False
 
-    def output_to_pymbolic(self, name):
-        return lp.TaggedVariable(name, "sumfac")
-
 
-class LargeAMatrix(ImmutableRecord):
+class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def __init__(self, rows, cols, transpose, derivative, face):
         assert isinstance(derivative, tuple)
-        ImmutableRecord.__init__(self,
-                                 rows=rows,
-                                 cols=cols,
-                                 transpose=transpose,
-                                 derivative=derivative,
-                                 face=face,
-                                 )
+        BasisTabulationMatrixBase.__init__(self,
+                                           rows=rows,
+                                           cols=cols,
+                                           transpose=transpose,
+                                           derivative=derivative,
+                                           face=face,
+                                           )
 
     @property
     def name(self):
@@ -91,14 +92,8 @@ class LargeAMatrix(ImmutableRecord):
     def vectorized(self):
         return True
 
-    def output_to_pymbolic(self, name):
-        return lp.TaggedVariable(name, "vector")
-
 
 def quadrature_points_per_direction():
-    # TODO use quadrature order from dune.perftool.pdelab.quadrature
-    # as soon as we have a generic implementation
-
     # Quadrature order
     q = quadrature_order()
 
@@ -274,7 +269,7 @@ def name_theta(transpose=False, derivative=False, face=None):
     return name
 
 
-def construct_amatrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None):
+def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None):
     dim = world_dimension()
     result = [None] * dim
 
@@ -290,6 +285,6 @@ def construct_amatrix_sequence(transpose=False, derivative=None, facedir=None, f
         if transpose:
             rows, cols = cols, rows
 
-        result[i] = AMatrix(rows, cols, transpose=transpose, derivative=derivative == i, face=onface)
+        result[i] = BasisTabulationMatrix(rows, cols, transpose=transpose, derivative=derivative == i, face=onface)
 
     return tuple(result)
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 0adcc6ab3b833fe010f36afea3ea17b087f60a37..ea0584641d81440497d98f7101c8c82a03fbd46f 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -1,55 +1,50 @@
 """ Sum factorization vectorization """
 
+from dune.perftool.loopy.symbolic import SumfactKernel
 from dune.perftool.generation import (generator_factory,
                                       get_counted_variable,
+                                      get_global_context_value,
                                       )
 from dune.perftool.pdelab.restriction import (Restriction,
                                               restricted_name,
                                               )
+from dune.perftool.sumfact.tabulation import BasisTabulationMatrixArray
 from dune.perftool.error import PerftoolError
 from dune.perftool.options import get_option
 
 import loopy as lp
 
 
-@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, r, *args: (a, r))
-def vectorization_info(a_matrices, restriction, new_a_matrices, buf, inp, index, padding):
-    return (new_a_matrices, buf, inp, index, padding)
+@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
+def _cache_vectorization_info(old, new):
+    if new is None:
+        raise PerftoolError("Vectorization info for sum factorization kernel was not gathered correctly!")
+    return new
 
 
-def get_vectorization_info(a_matrices, restriction):
-    if not isinstance(restriction, tuple):
-        restriction = (restriction, 0)
-    from dune.perftool.generation import get_global_context_value
-    if get_global_context_value("dry_run"):
-        # Return dummy data
-        return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None, frozenset())
+_collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True)
 
-    # Try getting the vectorization info collected after dry run
-    vec = vectorization_info(a_matrices, restriction, None, None, None, None, None)
-    if vec[0] is None:
-        raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt")
-    return vec
+
+def attach_vectorization_info(sf):
+    assert isinstance(sf, SumfactKernel)
+    if get_global_context_value("dry_run"):
+        return _collect_sumfact_nodes(sf)
+    else:
+        return _cache_vectorization_info(sf, None)
 
 
 def no_vectorization(sumfacts):
-    for sumf in sumfacts:
-        vectorization_info(sumf.a_matrices,
-                           sumf.restriction,
-                           sumf.a_matrices,
-                           get_counted_variable("buffer"),
-                           get_counted_variable("input"),
-                           None,
-                           frozenset())
-
-
-def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
-    stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage and sf.restriction == restriction])
-    if len(stage_sumfacts) in (3, 4):
+    for sf in sumfacts:
+        _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer"),
+                                              input=get_counted_variable("input")))
+
+
+def horizontal_vectorization_strategy(sumfacts):
+    if len(sumfacts) in (3, 4):
         # Map the sum factorization to their position in the joint kernel
         position_mapping = {}
         available = set(range(4))
-        for sf in stage_sumfacts:
+        for sf in sumfacts:
             if sf.preferred_position is not None:
                 # This asserts that no two kernels want to take the same position
                 # Later on, more complicated stuff might be necessary here.
@@ -58,7 +53,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
                 position_mapping[sf] = sf.preferred_position
 
         # Choose a position for those that have no preferred one!
-        for sumf in stage_sumfacts:
+        for sumf in sumfacts:
             if sumf.preferred_position is None:
                 position_mapping[sumf] = available.pop()
 
@@ -67,31 +62,38 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
         buf = get_counted_variable("joined_buffer")
 
         # Collect the large matrices!
-        large_a_matrices = []
-        for i in range(len(next(iter(stage_sumfacts)).a_matrices)):
+        large_matrix_sequence = []
+        for i in range(len(next(iter(sumfacts)).matrix_sequence)):
             # Assert that the matrices of all sum factorizations have the same size
-            assert len(set(tuple(sf.a_matrices[i].rows for sf in stage_sumfacts))) == 1
-            assert len(set(tuple(sf.a_matrices[i].cols for sf in stage_sumfacts))) == 1
+            assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1
+            assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1
 
             # Collect the derivative information
             derivative = [False] * 4
-            for sf in stage_sumfacts:
-                derivative[position_mapping[sf]] = sf.a_matrices[i].derivative
-
-            from dune.perftool.sumfact.amatrix import LargeAMatrix
-            large = LargeAMatrix(rows=next(iter(stage_sumfacts)).a_matrices[i].rows,
-                                 cols=next(iter(stage_sumfacts)).a_matrices[i].cols,
-                                 transpose=next(iter(stage_sumfacts)).a_matrices[i].transpose,
-                                 derivative=tuple(derivative),
-                                 face=next(iter(stage_sumfacts)).a_matrices[i].face,
-                                 )
-            large_a_matrices.append(large)
-
-        for sumf in stage_sumfacts:
-            vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buf, inp, position_mapping[sumf], frozenset(available))
+            for sf in sumfacts:
+                derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative
+
+            large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows,
+                                               cols=next(iter(sumfacts)).matrix_sequence[i].cols,
+                                               transpose=next(iter(sumfacts)).matrix_sequence[i].transpose,
+                                               derivative=tuple(derivative),
+                                               face=next(iter(sumfacts)).matrix_sequence[i].face,
+                                               )
+            large_matrix_sequence.append(large)
+
+        for sumf in sumfacts:
+            _cache_vectorization_info(sumf,
+                                      sumf.copy(matrix_sequence=tuple(large_matrix_sequence),
+                                                buffer=buf,
+                                                input=inp,
+                                                index=position_mapping[sumf],
+                                                padding=frozenset(available),
+                                                insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts),
+                                                )
+                                      )
     else:
         # Disable vectorization strategy
-        no_vectorization(stage_sumfacts)
+        no_vectorization(sumfacts)
 
 
 def decide_vectorization_strategy():
@@ -100,40 +102,14 @@ def decide_vectorization_strategy():
     as it is implemented through a post-processing (== loopy transformation) step.
     """
     from dune.perftool.generation import retrieve_cache_items
-    insns = [i for i in retrieve_cache_items("kernel_default and instruction")]
-
-    # Find all sum factorization kernels
-    sumfacts = frozenset()
-    for insn in insns:
-        if isinstance(insn, (lp.Assignment, lp.CallInstruction)):
-            sumfacts = sumfacts.union(find_sumfact(insn.expression))
+    sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
 
     if not get_option("vectorize_grads"):
         no_vectorization(sumfacts)
     else:
-        for stage in (1, 3):
-            res = (Restriction.NONE, Restriction.POSITIVE, Restriction.NEGATIVE)
-            import itertools as it
-            for restriction in it.product(res, res):
-                decide_stage_vectorization_strategy(sumfacts, stage, restriction)
-
-
-class HasSumfactMapper(lp.symbolic.CombineMapper):
-    def combine(self, *args):
-        return frozenset().union(*tuple(*args))
-
-    def map_constant(self, expr):
-        return frozenset()
-
-    def map_algebraic_leaf(self, expr):
-        return frozenset()
-
-    def map_loopy_function_identifier(self, expr):
-        return frozenset()
-
-    def map_sumfact_kernel(self, expr):
-        return frozenset({expr})
-
-
-def find_sumfact(expr):
-    return HasSumfactMapper()(expr)
+        # Currently we base our idea here on the fact that we only group sum
+        # factorization kernels with the same input.
+        inputkeys = set(sf.input_key for sf in sumfacts)
+        for inputkey in inputkeys:
+            sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
+            horizontal_vectorization_strategy(sumfact_filter)
diff --git a/python/dune/perftool/ufl/visitor.py b/python/dune/perftool/ufl/visitor.py
index 96f1cb85c89ee566882d04e56d386dac6df94d8c..38b7eee67b61d62cb8ac57ddc145eac7fb7aa4f6 100644
--- a/python/dune/perftool/ufl/visitor.py
+++ b/python/dune/perftool/ufl/visitor.py
@@ -98,7 +98,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             shape = len(element.value_shape())
             indices = indices[shape:]
             for i in range(len(element.value_shape())):
-                self.inames = self.inames + (self.interface.dimension_iname(context='arg', count=i),)
+                if self.interface.dimension_iname(context='arg', count=i) not in self.inames:
+                    self.inames = self.inames + (self.interface.dimension_iname(context='arg', count=i),)
 
             # For the purpose of basis evaluation, we need to take the leaf element
             leaf_element = element.sub_elements()[0]
@@ -108,7 +109,9 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
 
         # Have the issued instruction depend on the iname for this localfunction space
         inames = self.interface.lfs_inames(leaf_element, restriction, o.number())
-        self.inames = self.inames + inames
+        for iname in inames:
+            if iname not in self.inames:
+                self.inames = self.inames + (iname,)
 
         if self.reference_grad:
             return maybe_wrap_subscript(self.interface.pymbolic_reference_gradient(leaf_element, restriction, o.number()), indices)
@@ -215,7 +218,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             return index._value
         else:
             if index in self.dimension_indices:
-                self.inames = self.inames + (self.dimension_indices[index],)
+                if self.dimension_indices[index] not in self.inames:
+                    self.inames = self.inames + (self.dimension_indices[index],)
                 return Variable(self.dimension_indices[index])
             else:
                 return Variable(self.interface.name_index(index))