diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh
index fd405135573d4b4098ddc00cba2f816aa3932a80..6204b0a213796861533b9668f4c9a869198eb7cc 100644
--- a/dune/perftool/common/vectorclass.hh
+++ b/dune/perftool/common/vectorclass.hh
@@ -11,12 +11,38 @@
 
 #define BARRIER asm volatile("": : :"memory")
 
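+// Descriptive note (added for clarity): base_floatingpoint is a type trait
+// mapping a SIMD vector type to its underlying scalar floating point type.
+// It is specialized below for the VCL types and for their emulated
+// counterparts used when ENABLE_COUNTER is set.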
+template<typename T>
+struct base_floatingpoint
+{};
+
 #ifndef ENABLE_COUNTER
 
 #include <dune/perftool/vectorclass/vectorclass.h>
 #include <dune/perftool/vectorclass/vectormath_exp.h>
 #include <dune/perftool/vectorclass/vectormath_trig.h>
 
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = float;
+};
+
+#if MAX_VECTOR_SIZE >= 512
+
+template<>
+struct base_floatingpoint<Vec8d>
+{
+  using value = double;
+};
+
+#endif
+
 #else
 
 #include <algorithm>
@@ -46,10 +72,11 @@ struct Vec4d
     BARRIER;
   }
 
-  Vec4d(double d)
+  Vec4d(F dl, F du)
   {
     BARRIER;
-    std::fill(_d,_d+4,d);
+    std::fill(_d,_d+2,dl);
+    std::fill(_d+2,_d+4,du);
     BARRIER;
   }
 
@@ -114,6 +141,11 @@ struct Vec4d
 
 };
 
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = typename Vec4d::F;
+};
 
 /*****************************************************************************
 *
@@ -142,7 +174,7 @@ static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
 static inline Vec4d operator ++ (Vec4d & a, int) {
   BARRIER;
   Vec4d a0 = a;
-  a = a + 1.0;
+  a = a + Vec4d(1.0);
   BARRIER;
   return a0;
 }
@@ -150,7 +182,7 @@ static inline Vec4d operator ++ (Vec4d & a, int) {
 // prefix operator ++
 static inline Vec4d & operator ++ (Vec4d & a) {
   BARRIER;
-  a = a + 1.0;
+  a = a + Vec4d(1.0);
   BARRIER;
   return a;
 }
@@ -187,7 +219,7 @@ static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
 static inline Vec4d operator -- (Vec4d & a, int) {
   BARRIER;
   Vec4d a0 = a;
-  a = a - 1.0;
+  a = a - Vec4d(1.0);
   BARRIER;
   return a0;
 }
@@ -195,7 +227,7 @@ static inline Vec4d operator -- (Vec4d & a, int) {
 // prefix operator --
 static inline Vec4d & operator -- (Vec4d & a) {
   BARRIER;
-  a = a - 1.0;
+  a = a - Vec4d(1.0);
   BARRIER;
   return a;
 }
@@ -708,10 +740,19 @@ struct Vec8d
     BARRIER;
   }
 
-  Vec8d(double d)
+  Vec8d(F dl, F du)
   {
     BARRIER;
-    std::fill(_d,_d+8,d);
+    std::fill(_d,_d+4,dl);
+    std::fill(_d+4,_d+8,du);
+    BARRIER;
+  }
+
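+  // Construct a Vec8d from two Vec4d halves, mirroring the VCL constructor.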
+  Vec8d(Vec4d low, Vec4d high)
+  {
+    BARRIER;
+    std::copy(low._d, low._d+4, _d);
+    std::copy(high._d, high._d+4, _d+4);
     BARRIER;
   }
 
@@ -721,6 +762,24 @@ struct Vec8d
     BARRIER;
   }
 
+  Vec4d get_low() const
+  {
+    BARRIER;
+    Vec4d ret;
+    ret.load(_d);
+    BARRIER;
+    return ret;
+  }
+
+  Vec4d get_high() const
+  {
+    BARRIER;
+    Vec4d ret;
+    ret.load(_d + 4);
+    BARRIER;
+    return ret;
+  }
+
   Vec8d& load(const F* p)
   {
     BARRIER;
@@ -776,6 +835,11 @@ struct Vec8d
 
 };
 
+template<>
+struct base_floatingpoint<Vec8d>
+{
+  using value = typename Vec8d::F;
+};
 
 /*****************************************************************************
 *
@@ -804,7 +868,7 @@ static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) {
 static inline Vec8d operator ++ (Vec8d & a, int) {
   BARRIER;
   Vec8d a0 = a;
-  a = a + 1.0;
+  a = a + Vec8d(1.0);
   BARRIER;
   return a0;
 }
@@ -812,7 +876,7 @@ static inline Vec8d operator ++ (Vec8d & a, int) {
 // prefix operator ++
 static inline Vec8d & operator ++ (Vec8d & a) {
   BARRIER;
-  a = a + 1.0;
+  a = a + Vec8d(1.0);
   BARRIER;
   return a;
 }
@@ -849,7 +913,7 @@ static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) {
 static inline Vec8d operator -- (Vec8d & a, int) {
   BARRIER;
   Vec8d a0 = a;
-  a = a - 1.0;
+  a = a - Vec8d(1.0);
   BARRIER;
   return a0;
 }
@@ -857,7 +921,7 @@ static inline Vec8d operator -- (Vec8d & a, int) {
 // prefix operator --
 static inline Vec8d & operator -- (Vec8d & a) {
   BARRIER;
-  a = a - 1.0;
+  a = a - Vec8d(1.0);
   BARRIER;
   return a;
 }
@@ -1215,6 +1279,11 @@ struct Vec8f
 
 };
 
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = typename Vec8f::F;
+};
 
 /*****************************************************************************
 *
@@ -1243,7 +1312,7 @@ static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
 static inline Vec8f operator ++ (Vec8f & a, int) {
   BARRIER;
   Vec8f a0 = a;
-  a = a + 1.0;
+  a = a + Vec8f(1.0);
   BARRIER;
   return a0;
 }
@@ -1251,7 +1320,7 @@ static inline Vec8f operator ++ (Vec8f & a, int) {
 // prefix operator ++
 static inline Vec8f & operator ++ (Vec8f & a) {
   BARRIER;
-  a = a + 1.0;
+  a = a + Vec8f(1.0);
   BARRIER;
   return a;
 }
@@ -1288,7 +1357,7 @@ static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
 static inline Vec8f operator -- (Vec8f & a, int) {
   BARRIER;
   Vec8f a0 = a;
-  a = a - 1.0;
+  a = a - Vec8f(1.0);
   BARRIER;
   return a0;
 }
@@ -1296,7 +1365,7 @@ static inline Vec8f operator -- (Vec8f & a, int) {
 // prefix operator --
 static inline Vec8f & operator -- (Vec8f & a) {
   BARRIER;
-  a = a - 1.0;
+  a = a - Vec8f(1.0);
   BARRIER;
   return a;
 }
diff --git a/dune/perftool/sumfact/horizontaladd.hh b/dune/perftool/sumfact/horizontaladd.hh
new file mode 100644
index 0000000000000000000000000000000000000000..db7634f0e5507214318dabb719240ff3674808ed
--- /dev/null
+++ b/dune/perftool/sumfact/horizontaladd.hh
@@ -0,0 +1,19 @@
+#ifndef DUNE_PERFTOOL_SUMFACT_HORIZONTALADD_HH
+#define DUNE_PERFTOOL_SUMFACT_HORIZONTALADD_HH
+
+#include <dune/perftool/common/vectorclass.hh>
+
+
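+// Horizontal sums over only the lower or upper half of a vector register.
+// Generated kernels call these when two different outputs share one SIMD
+// register and each needs a horizontal sum over its own half.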
+template<class V>
+typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
+{
+  return horizontal_add(x.get_low());
+}
+
+template<class V>
+typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
+{
+  return horizontal_add(x.get_high());
+}
+
+#endif
diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index f73c6a2f717b243e32411a5feea948c7777ce430..d2ce09c39bf6ff5b87cc236cbf5d2a5bfcf850f1 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -66,6 +66,9 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
   a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
 }
 
+/** TODO: Is this transpose using blend8d superior to the swap_halves
+ *        version below using get_low/get_high?
+ */
 void transpose_reg (Vec8d& a0, Vec8d& a1)
 {
   Vec8d b0, b1;
@@ -75,6 +78,48 @@ void transpose_reg (Vec8d& a0, Vec8d& a1)
   a1 = b1;
 }
 
+namespace impl
+{
+  /* (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp) */
+  void swap_halves(Vec8d& a, Vec8d& b)
+  {
+    Vec4d tmp = a.get_high();
+    a = Vec8d(a.get_low(), b.get_low());
+    b = Vec8d(tmp, b.get_high());
+  }
+
+  /* A 4x8 transpose that applies the Vec4d 4x4 transpose independently
+   * to the lower and upper halves of the four Vec8d arguments.
+   */
+  void _transpose4x8(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
+  {
+    Vec8d b0,b1,b2,b3;
+    b0 = blend8d<0,8,2,10,4,12,6,14>(a0,a1);
+    b1 = blend8d<1,9,3,11,5,13,7,15>(a0,a1);
+    b2 = blend8d<0,8,2,10,4,12,6,14>(a2,a3);
+    b3 = blend8d<1,9,3,11,5,13,7,15>(a2,a3);
+    a0 = blend8d<0,1,8,9,4,5,12,13>(b0,b2);
+    a1 = blend8d<0,1,8,9,4,5,12,13>(b1,b3);
+    a2 = blend8d<2,3,10,11,6,7,14,15>(b0,b2);
+    a3 = blend8d<2,3,10,11,6,7,14,15>(b1,b3);
+  }
+}
+
+/* The 8x8 transpose of eight Vec8d registers. It uses the same shuffling
+ * as the Vec4d transpose, but on the 4x4 subblocks. Afterwards, the
+ * off-diagonal blocks are swapped.
+ */
+void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3,
+                   Vec8d& a4, Vec8d& a5, Vec8d& a6, Vec8d& a7)
+{
+  impl::_transpose4x8(a0,a1,a2,a3);
+  impl::_transpose4x8(a4,a5,a6,a7);
+  impl::swap_halves(a0,a4);
+  impl::swap_halves(a1,a5);
+  impl::swap_halves(a2,a6);
+  impl::swap_halves(a3,a7);
+}
+
 #endif
 
 #endif
diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index c8c085c178d7dc2224d1efd1d76efaacf9da5259..e541e71366371039157cb757201f710ffd0a19ca 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -43,6 +43,7 @@ from dune.perftool.generation.loopy import (barrier,
                                             kernel_cached,
                                             noop_instruction,
                                             silenced_warning,
+                                            subst_rule,
                                             temporary_variable,
                                             transform,
                                             valuearg,
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index a97df4744fd1e661554a6febb306a8f858589c31..df47d5e9fb1ac03d8ed6bde6ec1de203470f4d2f 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -172,8 +172,8 @@ def noop_instruction(**kwargs):
                    context_tags="kernel",
                    cache_key_generator=no_caching,
                    )
-def transform(trafo, *args):
-    return (trafo, args)
+def transform(trafo, *args, **kwargs):
+    return (trafo, args, kwargs)
 
 
 @generator_factory(item_tags=("instruction", "barrier"),
@@ -216,3 +216,8 @@ def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwar
     globalarg(name, **kwargs)
 
     return name
+
+
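+# Cache a loopy substitution rule for the kernel under construction. This is
+# used e.g. to precompute an expression that feeds several horizontal_add
+# calls (see sumfact/symbolic.py).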
+@generator_factory(item_tags=("substrule",), context_tags="kernel")
+def subst_rule(name, args, expr):
+    return lp.SubstitutionRule(name, args, expr)
diff --git a/python/dune/perftool/loopy/transformations/vectorview.py b/python/dune/perftool/loopy/transformations/vectorview.py
index 1160c8bda81f0f81f6a6c9c6b8c2db327b0fe79d..9b74e3d8ca67cc28584def2ec0055b5504febfc4 100644
--- a/python/dune/perftool/loopy/transformations/vectorview.py
+++ b/python/dune/perftool/loopy/transformations/vectorview.py
@@ -33,7 +33,9 @@ def add_vector_view(knl, tmpname, pad_to=None, flatview=False):
 
     # Add base storage to the original temporary!
     if not temp.base_storage:
-        temp = temp.copy(base_storage=bsname)
+        temp = temp.copy(base_storage=bsname,
+                         _base_storage_access_may_be_aliasing=True,
+                         )
         temporaries[tmpname] = temp
     else:
         bsname = temp.base_storage
@@ -77,6 +79,7 @@ def add_vector_view(knl, tmpname, pad_to=None, flatview=False):
                                                 base_storage=bsname,
                                                 dtype=dtype_floatingpoint(),
                                                 scope=lp.temp_var_scope.PRIVATE,
+                                                _base_storage_access_may_be_aliasing=True,
                                                 )
 
     # Avoid that any of these temporaries are eliminated
diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py
index 143d9b6c0a0ef84f76a9fe39c3024ca4cec041b5..f76f7881cf76e4360d2b6b8c0e691cfcad1488c1 100644
--- a/python/dune/perftool/loopy/vcl.py
+++ b/python/dune/perftool/loopy/vcl.py
@@ -2,7 +2,7 @@
 Our extensions to the loopy type system
 """
 from dune.perftool.options import get_option
-from dune.perftool.generation import function_mangler
+from dune.perftool.generation import function_mangler, include_file
 
 import loopy as lp
 import numpy as np
@@ -62,8 +62,10 @@ def get_vcl_typename(nptype, register_size=None, vector_width=None):
 
 
 class ExplicitVCLCast(lp.symbolic.FunctionIdentifier):
-    def __init__(self, nptype, vector_width):
+    def __init__(self, nptype, vector_width=None):
         self.nptype = nptype
+        if vector_width is None:
+            vector_width = get_vcl_type_size(nptype)
         self.vector_width = vector_width
 
     def __getinitargs__(self):
@@ -74,8 +76,17 @@ class ExplicitVCLCast(lp.symbolic.FunctionIdentifier):
         return get_vcl_typename(self.nptype, vector_width=self.vector_width)
 
 
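+# Function identifier for a VCL constructor call that assembles a full-width
+# vector from two half-width parts, e.g. Vec8d(lower, upper).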
+class VCLLowerUpperLoad(ExplicitVCLCast):
+    pass
+
+
 @function_mangler
 def vcl_cast_mangler(knl, func, arg_dtypes):
+    if isinstance(func, VCLLowerUpperLoad):
+        return lp.CallMangleInfo(func.name,
+                                 (lp.types.NumpyType(func.nptype),),
+                                 arg_dtypes)
+
     if isinstance(func, ExplicitVCLCast):
         return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],))
 
@@ -92,7 +103,8 @@ def vcl_function_mangler(knl, func, arg_dtypes):
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
         return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))
 
-    if func == "horizontal_add":
+    if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"):
         dtype = arg_dtypes[0]
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
-        return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(dtype.dtype),), (vcl,))
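+        # horizontal_add itself comes from the VCL, the lower/upper variants from our own header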
+        include_file("dune/perftool/sumfact/horizontaladd.hh", filetag="operatorfile")
+        return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,))
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 5041772548f6646a9c1de40f0d6e929078900faa..6c239ccd5af49e9c042528024a4e6fc5c13b4d0f 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -78,7 +78,7 @@ class PerftoolFormOptionsArray(ImmutableRecord):
     fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
     sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
     vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|fromlist")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model")
     vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
     vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
     vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 1c3668ed3bca803035369a7ff63ed5c75bc4ad7a..03496a5cfbc24b0191e29df5bf83bac13fa99bc9 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -541,19 +541,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
 
     # Apply the transformations that were gathered during tree traversals
     for trafo in transformations:
-        kernel = trafo[0](kernel, *trafo[1])
-
-    # Precompute all the substrules
-    for sr in kernel.substitutions:
-        tmpname = "precompute_{}".format(sr)
-        kernel = lp.precompute(kernel,
-                               sr,
-                               temporary_name=tmpname,
-                               )
-        # Vectorization strategies are actually very likely to eliminate the
-        # precomputation temporary. To avoid the temporary elimination warning
-        # we need to explicitly disable it.
-        kernel = kernel.copy(silenced_warnings=kernel.silenced_warnings + ["temp_to_write({})".format(tmpname)])
+        kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
     from dune.perftool.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index 9566efb71e358dcaf346bd3a24b94858e3a56f26..02eb474754f1233e8cfbe86f57c474db4d2ed9d2 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -9,12 +9,15 @@ from dune.perftool.generation import (backend,
                                       generator_factory,
                                       get_counted_variable,
                                       get_counter,
+                                      get_global_context_value,
+                                      globalarg,
                                       iname,
                                       instruction,
                                       post_include,
                                       kernel_cached,
                                       temporary_variable,
                                       transform,
+                                      valuearg,
                                       )
 from dune.perftool.options import (get_form_option,
                                    get_option,
@@ -26,15 +29,16 @@ from dune.perftool.pdelab.localoperator import determine_accumulation_space
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.pdelab.signatures import assembler_routine_name
 from dune.perftool.pdelab.geometry import world_dimension
+from dune.perftool.pdelab.spaces import name_lfs
 from dune.perftool.sumfact.tabulation import (basis_functions_per_direction,
                                               construct_basis_matrix_sequence,
                                               )
 from dune.perftool.sumfact.switch import (get_facedir,
                                           get_facemod,
                                           )
-from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase
+from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelOutputBase
 from dune.perftool.ufl.modified_terminals import extract_modified_arguments
-from dune.perftool.tools import get_pymbolic_basename
+from dune.perftool.tools import get_pymbolic_basename, get_leaf
 from dune.perftool.error import PerftoolError
 from dune.perftool.sumfact.quadrature import quadrature_inames
 
@@ -81,21 +85,153 @@ def accum_iname(element, bound, i):
     return sumfact_iname(bound, "accum{}".format(suffix))
 
 
-class AlreadyAssembledInput(SumfactKernelInputBase):
-    def __init__(self, index):
-        self.index = index
-
-    def __eq__(self, other):
-        return type(self) == type(other) and self.index == other.index
+class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord):
+    def __init__(self,
+                 accumvar=None,
+                 restriction=None,
+                 test_element=None,
+                 test_element_index=None,
+                 trial_element=None,
+                 trial_element_index=None,
+                 ):
+        # TODO: Isn't accumvar superfluous in the presence of all the other information?
+        ImmutableRecord.__init__(self,
+                                 accumvar=accumvar,
+                                 restriction=restriction,
+                                 test_element=test_element,
+                                 test_element_index=test_element_index,
+                                 trial_element=trial_element,
+                                 trial_element_index=trial_element_index,
+                                 )
 
     def __repr__(self):
-        return "AlreadyAssembledInput({})".format(self.index)
+        return ImmutableRecord.__repr__(self)
 
-    def __hash__(self):
-        return hash(self.index)
+    @property
+    def within_inames(self):
+        if self.trial_element is None:
+            return ()
+        else:
+            from dune.perftool.sumfact.basis import lfs_inames
+            return lfs_inames(get_leaf(self.trial_element, self.trial_element_index), self.restriction)
+
+    def realize(self, sf, result, insn_dep, inames=None, additional_inames=()):
+        trial_leaf_element = get_leaf(self.trial_element, self.trial_element_index) if self.trial_element is not None else None
+
+        basis_size = tuple(mat.basis_size for mat in sf.matrix_sequence)
+
+        if inames is None:
+            inames = tuple(accum_iname(trial_leaf_element, mat.rows, i)
+                           for i, mat in enumerate(sf.matrix_sequence))
+
+            # Determine the expression to accumulate with. This depends on the vectorization strategy!
+            from dune.perftool.tools import maybe_wrap_subscript
+            result = maybe_wrap_subscript(result, tuple(prim.Variable(i) for i in inames))
+
+        # Collect the lfs and lfs indices for the accumulate call
+        restriction = (0, 0) if self.restriction is None else self.restriction
+        test_lfs = name_lfs(self.test_element, restriction[0], self.test_element_index)
+        valuearg(test_lfs, dtype=lp.types.NumpyType("str"))
+        test_lfs_index = flatten_index(tuple(prim.Variable(i) for i in inames),
+                                       basis_size,
+                                       order="f"
+                                       )
+
+        accum_args = [prim.Variable(test_lfs), test_lfs_index]
+
+        # In the jacobian case, also determine the space for the ansatz space
+        if sf.within_inames:
+            # TODO the next line should get its inames from
+            # elsewhere. This is *NOT* robust (but works right now)
+            ansatz_lfs = name_lfs(self.trial_element, restriction[1], self.trial_element_index)
+            valuearg(ansatz_lfs, dtype=lp.types.NumpyType("str"))
+            from dune.perftool.sumfact.basis import _basis_functions_per_direction
+            ansatz_lfs_index = flatten_index(tuple(prim.Variable(sf.within_inames[i])
+                                                   for i in range(world_dimension())),
+                                             _basis_functions_per_direction(trial_leaf_element),
+                                             order="f"
+                                             )
+
+            accum_args.append(prim.Variable(ansatz_lfs))
+            accum_args.append(ansatz_lfs_index)
+
+        accum_args.append(result)
+
+        if not get_form_option("fastdg"):
+            rank = 2 if self.within_inames else 1
+            expr = prim.Call(PDELabAccumulationFunction(self.accumvar, rank),
+                             tuple(accum_args)
+                             )
+            dep = instruction(assignees=(),
+                              expression=expr,
+                              forced_iname_deps=frozenset(inames + additional_inames + self.within_inames),
+                              forced_iname_deps_is_final=True,
+                              depends_on=insn_dep,
+                              predicates=sf.predicates
+                              )
 
-    def __str__(self):
-        return "Input{}".format(self.index[0])
+            return frozenset({dep})
+        return frozenset()
+
+    def realize_direct(self, result, inames, shape, args):
+        ft = get_global_context_value("form_type")
+
+        if self.test_element_index is None:
+            direct_output = "{}_access".format(self.accumvar)
+        else:
+            direct_output = "{}_access_comp{}".format(self.accumvar, self.test_element_index)
+        ftags = ",".join(["f"] * len(shape))
+
+        from dune.perftool.sumfact.realization import alias_data_array
+        if ft == 'residual' or ft == 'jacobian_apply':
+            globalarg(direct_output,
+                      shape=shape,
+                      dim_tags=ftags,
+                      offset=_dof_offset(self.test_element, self.test_element_index),
+                      )
+            alias_data_array(direct_output, self.accumvar)
+            lhs = prim.Subscript(prim.Variable(direct_output), inames)
+        else:
+            assert ft == 'jacobian'
+
+            direct_output = "{}x{}".format(direct_output, self.trial_element_index)
+            rowsize = sum(tuple(s for s in _local_sizes(self.trial_element)))
+            element = get_leaf(self.trial_element, self.trial_element_index)
+            other_shape = tuple(element.degree() + 1 for e in shape)
+            from pytools import product
+            manual_strides = tuple("stride:{}".format(rowsize * product(shape[:i])) for i in range(len(shape)))
+            dim_tags = "{},{}".format(ftags, ",".join(manual_strides))
+            globalarg(direct_output,
+                      shape=other_shape + shape,
+                      offset=rowsize * _dof_offset(self.test_element, self.test_element_index) + _dof_offset(self.trial_element, self.trial_element_index),
+                      dim_tags=dim_tags,
+                      )
+            alias_data_array(direct_output, self.accumvar)
+            # TODO: It is at least questionable whether using the *order* of the inames
+            #       here for indexing is a good idea. Then again, it is hard to find an alternative.
+            _ansatz_inames = tuple(prim.Variable(i) for i in self.within_inames)
+            lhs = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + inames)
+
+        result = prim.Sum((lhs, result))
+        return frozenset({instruction(assignee=lhs, expression=result, **args)})
+
+
+def _local_sizes(element):
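+    """ Yield the number of local DOFs for each leaf of a (possibly mixed) element """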
+    from ufl import FiniteElement, MixedElement
+    if isinstance(element, MixedElement):
+        for subel in element.sub_elements():
+            for s in _local_sizes(subel):
+                yield s
+    else:
+        assert isinstance(element, FiniteElement)
+        yield (element.degree() + 1)**element.cell().geometric_dimension()
+
+
+def _dof_offset(element, component):
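+    """ The offset of the DOFs of the given component within the element-local DOF vector """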
+    if component is None:
+        return 0
+    else:
+        sizes = tuple(s for s in _local_sizes(element))
+        return sum(sizes[0:component])
 
 
 class SumfactAccumulationInfo(ImmutableRecord):
@@ -266,16 +402,18 @@ def generate_accumulation_instruction(expr, visitor):
     if priority is None:
         priority = 3
 
+    output = AccumulationOutput(accumvar=accumvar,
+                                restriction=(test_info.restriction, trial_info.restriction),
+                                test_element=test_info.element,
+                                test_element_index=test_info.element_index,
+                                trial_element=trial_info.element,
+                                trial_element_index=trial_info.element_index,
+                                )
+
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
-                       restriction=(test_info.restriction, trial_info.restriction),
                        stage=3,
                        position_priority=priority,
-                       accumvar=accumvar,
-                       test_element=test_info.element,
-                       test_element_index=test_info.element_index,
-                       trial_element=trial_info.element,
-                       trial_element_index=trial_info.element_index,
-                       input=AlreadyAssembledInput(index=(test_info.element_index,)),
+                       output=output,
                        predicates=predicates,
                        )
 
@@ -342,56 +480,10 @@ def generate_accumulation_instruction(expr, visitor):
                                           depends_on=insn_dep,
                                           within_inames=frozenset(jacobian_inames))})
 
-    inames = tuple(accum_iname(trial_leaf_element, mat.rows, i)
-                   for i, mat in enumerate(vsf.matrix_sequence))
-
-    # Collect the lfs and lfs indices for the accumulate call
-    test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames),
-                                   basis_size,
-                                   order="f"
-                                   )
-
-    # In the jacobian case, also determine the space for the ansatz space
-    if jacobian_inames:
-        # TODO the next line should get its inames from
-        # elsewhere. This is *NOT* robust (but works right now)
-        from dune.perftool.sumfact.basis import _basis_functions_per_direction
-        ansatz_lfs.index = flatten_index(tuple(prim.Variable(jacobian_inames[i])
-                                               for i in range(world_dimension())),
-                                         _basis_functions_per_direction(trial_leaf_element),
-                                         order="f"
-                                         )
-
     # Add a sum factorization kernel that implements the multiplication
     # with the test function (stage 3)
     from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
     result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep)))
 
-    # Determine the expression to accumulate with. This depends on the vectorization strategy!
-    result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
-    vecinames = ()
-
-    if vsf.vectorized:
-        iname = accum_iname(trial_leaf_element, vsf.vector_width, "vec")
-        vecinames = (iname,)
-        transform(lp.tag_inames, [(iname, "vec")])
-        from dune.perftool.tools import maybe_wrap_subscript
-        result = prim.Call(prim.Variable("horizontal_add"),
-                           (maybe_wrap_subscript(result, prim.Variable(iname)),),
-                           )
-
     if not get_form_option("fastdg"):
-        rank = 2 if jacobian_inames else 1
-        expr = prim.Call(PDELabAccumulationFunction(accumvar, rank),
-                         (test_lfs.get_args() +
-                          ansatz_lfs.get_args() +
-                          (result,)
-                          )
-                         )
-        instruction(assignees=(),
-                    expression=expr,
-                    forced_iname_deps=frozenset(inames + vecinames + jacobian_inames),
-                    forced_iname_deps_is_final=True,
-                    depends_on=insn_dep,
-                    predicates=predicates
-                    )
+        vsf.output.realize(vsf, result, insn_dep)
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index a941b8e0d51e956010f7f11532157993766becf3..7c2740e13f195678ff234c9dbd946002f710eb83 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -11,6 +11,7 @@ from dune.perftool.generation import (backend,
                                       get_counted_variable,
                                       get_counter,
                                       get_global_context_value,
+                                      globalarg,
                                       iname,
                                       instruction,
                                       kernel_cached,
@@ -70,7 +71,11 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
     def __str__(self):
         return repr(self)
 
-    def realize(self, sf, index, insn_dep):
+    @property
+    def direct_input_is_possible(self):
+        return get_form_option("fastdg")
+
+    def realize(self, sf, insn_dep, index=0):
         lfs = name_lfs(self.element, self.restriction, self.element_index)
         basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
         container = self.coeff_func(self.restriction)
@@ -85,18 +90,31 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
 
         assignee = prim.Subscript(prim.Variable(name),
                                   (prim.Variable(basisiname),) + (index,))
-        instruction(assignee=assignee,
-                    expression=coeff,
-                    depends_on=sf.insn_dep.union(insn_dep),
-                    tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
-                    )
-
-    @property
-    def direct_input(self):
-        if get_form_option("fastdg"):
-            return self.coeff_func(self.restriction)
-        else:
-            return None
+        insn = instruction(assignee=assignee,
+                           expression=coeff,
+                           depends_on=sf.insn_dep.union(insn_dep),
+                           tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
+                           )
+
+        return insn_dep.union(frozenset({insn}))
+
+    def realize_direct(self, shape, inames):
+        arg = "{}_access{}".format(self.coeff_func(self.restriction),
+                                   "_comp{}".format(self.element_index) if self.element_index is not None else ""
+                                   )
+
+        from dune.perftool.sumfact.accumulation import _dof_offset
+        from dune.perftool.sumfact.realization import alias_data_array
+        globalarg(arg,
+                  shape=shape,
+                  dim_tags=",".join("f" * len(shape)),
+                  offset=_dof_offset(self.element, self.element_index),
+                  )
+
+        func = self.coeff_func(self.restriction)
+        alias_data_array(arg, func)
+
+        return prim.Subscript(prim.Variable(arg), inames)
 
 
 def _basis_functions_per_direction(element):
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index b0e85d43bdaa27409af81596ecf3274b6dbabca7..d4aea877bdfd46157f3447baaf02869a2dfe6131 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -75,22 +75,8 @@ def _realize_sum_factorization_kernel(sf):
                                                          ),
                                              }))
 
-    direct_input = sf.input.direct_input
-
-    # Set up the input for stage 1
-    if direct_input is None:
-        if sf.vectorized:
-            for i, inputsf in enumerate(sf.kernels):
-                inputsf.input.realize(sf, i, inputsf.insn_dep.union(insn_dep))
-        else:
-            sf.input.realize(sf, 0, insn_dep)
-
-        insn_dep = insn_dep.union(frozenset({lp.match.Writes("input_{}".format(sf.buffer))}))
-    else:
-        if sf.input.element_index is None:
-            direct_input_arg = "{}_access".format(direct_input)
-        else:
-            direct_input_arg = "{}_access_comp{}".format(direct_input, sf.input.element_index)
+    if not sf.input.direct_input_is_possible:
+        insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep))
 
     # Prepare some dim_tags/shapes for later use
     ftags = ",".join(["f"] * sf.length)
@@ -143,24 +129,12 @@ def _realize_sum_factorization_kernel(sf):
         # * a global data structure (if FastDGGridOperator is in use)
         # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
         input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
-        if l == 0 and direct_input is not None:
+        if l == 0 and sf.input.direct_input_is_possible:
             # See comment below
             input_inames = permute_backward(input_inames, perm)
             inp_shape = permute_backward(inp_shape, perm)
 
-            globalarg(direct_input_arg,
-                      shape=inp_shape,
-                      dim_tags=novec_ftags,
-                      offset=_dof_offset(sf.input.element, sf.input.element_index),
-                      )
-            alias_data_array(direct_input_arg, direct_input)
-            if matrix.vectorized:
-                input_summand = prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=sf.vector_width),
-                                          (prim.Subscript(prim.Variable(direct_input_arg),
-                                                          input_inames),))
-            else:
-                input_summand = prim.Subscript(prim.Variable(direct_input_arg),
-                                               input_inames + vec_iname)
+            input_summand = sf.input.realize_direct(inp_shape, input_inames)
         else:
             # If we did permute the order of a matrices above we also
             # permuted the order of out_inames. Unfortunately the
@@ -215,72 +189,33 @@ def _realize_sum_factorization_kernel(sf):
         if l == len(matrix_sequence) - 1:
             output_inames = permute_backward(output_inames, perm)
 
-        # In case of direct output we directly accumulate the result
-        # of the Sumfactorization into some global data structure.
-        if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3:
-            ft = get_global_context_value("form_type")
-            if sf.test_element_index is None:
-                direct_output = "{}_access".format(sf.accumvar)
-            else:
-                direct_output = "{}_access_comp{}".format(sf.accumvar, sf.test_element_index)
-            if ft == 'residual' or ft == 'jacobian_apply':
-                globalarg(direct_output,
-                          shape=output_shape,
-                          dim_tags=novec_ftags,
-                          offset=_dof_offset(sf.test_element, sf.test_element_index),
-                          )
-                alias_data_array(direct_output, sf.accumvar)
-
-                assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
-            else:
-                assert ft == 'jacobian'
-
-                direct_output = "{}x{}".format(direct_output, sf.trial_element_index)
-                rowsize = sum(tuple(s for s in _local_sizes(sf.trial_element)))
-                element = sf.trial_element
-                if isinstance(element, MixedElement):
-                    element = element.extract_component(sf.trial_element_index)[1]
-                other_shape = tuple(element.degree() + 1 for e in range(sf.length))
-                from pytools import product
-                manual_strides = tuple("stride:{}".format(rowsize * product(output_shape[:i])) for i in range(sf.length))
-                dim_tags = "{},{}".format(novec_ftags, ",".join(manual_strides))
-                globalarg(direct_output,
-                          shape=other_shape + output_shape,
-                          offset=rowsize * _dof_offset(sf.test_element, sf.test_element_index) + _dof_offset(sf.trial_element, sf.trial_element_index),
-                          dim_tags=dim_tags,
-                          )
-                alias_data_array(direct_output, sf.accumvar)
-                # TODO: It is at least questionnable, whether using the *order* of the inames in here
-                #       for indexing is a good idea. Then again, it is hard to find an alternative.
-                _ansatz_inames = tuple(prim.Variable(i) for i in sf.within_inames)
-                assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames)
-
-            # In case of vectorization we need to apply a horizontal add
-            if matrix.vectorized:
-                matprod = prim.Call(prim.Variable("horizontal_add"),
-                                    (matprod,))
-
-            # We need to accumulate
-            matprod = prim.Sum((assignee, matprod))
-        else:
-            assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
-
         tag = "sumfact_stage{}".format(sf.stage)
         if sf.stage == 3:
             tag = "{}_{}".format(tag, "_".join(sf.within_inames))
 
-        # Issue the reduction instruction that implements the multiplication
-        # at the same time store the instruction ID for the next instruction to depend on
-        insn_dep = frozenset({instruction(assignee=assignee,
-                                          expression=matprod,
-                                          forced_iname_deps=frozenset([iname for iname in out_inames]).union(frozenset(sf.within_inames)),
-                                          forced_iname_deps_is_final=True,
-                                          depends_on=insn_dep,
-                                          tags=frozenset({tag}),
-                                          predicates=sf.predicates,
-                                          groups=frozenset({sf.group_name}),
-                                          )
-                              })
+        # Collect the key word arguments for the loopy instruction
+        insn_args = {"forced_iname_deps": frozenset([i for i in out_inames]).union(frozenset(sf.within_inames)),
+                     "forced_iname_deps_is_final": True,
+                     "depends_on": insn_dep,
+                     "tags": frozenset({tag}),
+                     "predicates": sf.predicates,
+                     "groups": frozenset({sf.group_name}),
+                     }
+
+        # In case of direct output we directly accumulate the result
+        # of the Sumfactorization into some global data structure.
+        if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3:
+            if sf.vectorized:
+                insn_args["forced_iname_deps"] = insn_args["forced_iname_deps"].union(frozenset({vec_iname[0].name}))
+            insn_dep = sf.output.realize_direct(matprod, output_inames, out_shape, insn_args)
+        else:
+            # Issue the reduction instruction that implements the multiplication
+            # at the same time store the instruction ID for the next instruction to depend on
+            insn_dep = frozenset({instruction(assignee=prim.Subscript(prim.Variable(out), output_inames + vec_iname),
+                                              expression=matprod,
+                                              **insn_args
+                                              )
+                                  })
 
     # Measure times and count operations in c++ code
     if get_option("instrumentation_level") >= 4:
@@ -301,22 +236,3 @@ def _realize_sum_factorization_kernel(sf):
     silenced_warning('read_no_write({})'.format(out))
 
     return lp.TaggedVariable(out, sf.tag), insn_dep
-
-
-def _local_sizes(element):
-    from ufl import FiniteElement, MixedElement
-    if isinstance(element, MixedElement):
-        for subel in element.sub_elements():
-            for s in _local_sizes(subel):
-                yield s
-    else:
-        assert isinstance(element, FiniteElement)
-        yield (element.degree() + 1)**element.cell().geometric_dimension()
-
-
-def _dof_offset(element, component):
-    if component is None:
-        return 0
-    else:
-        sizes = tuple(s for s in _local_sizes(element))
-        return sum(sizes[0:component])
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index babb432bc324f6ce08cec9f931df7ba8191181fa..db93c6cf0145007e13dc82bbb7ff817c0b54e060 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -1,10 +1,16 @@
 """ A pymbolic node representing a sum factorization kernel """
 
 from dune.perftool.options import get_option
-from dune.perftool.generation import get_counted_variable
+from dune.perftool.generation import (get_counted_variable,
+                                      subst_rule,
+                                      transform,
+                                      )
 from dune.perftool.pdelab.geometry import local_dimension, world_dimension
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray
+from dune.perftool.loopy.target import dtype_floatingpoint
+from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad
+from dune.perftool.tools import get_leaf, maybe_wrap_subscript
 
 from pytools import ImmutableRecord, product
 
@@ -18,11 +24,133 @@ import inspect
 
 class SumfactKernelInputBase(object):
     @property
-    def direct_input(self):
-        return None
+    def direct_input_is_possible(self):
+        return False
+
+    def realize(self, sf, dep, index=0):
+        return dep
+
+    def realize_direct(self, shape, inames):
+        raise NotImplementedError
+
+    def __repr__(self):
+        return "SumfactKernelInputBase()"
+
+
+class VectorSumfactKernelInput(SumfactKernelInputBase):
+    def __init__(self, inputs):
+        assert(isinstance(inputs, tuple))
+        self.inputs = inputs
+
+    def __repr__(self):
+        return "_".join(repr(i) for i in self.inputs)
+
+    @property
+    def direct_input_is_possible(self):
+        return all(i.direct_input_is_possible for i in self.inputs)
+
+    def realize(self, sf, dep):
+        for i, inp in enumerate(self.inputs):
+            dep = dep.union(inp.realize(sf, dep, index=i))
+        return dep
+
+    def realize_direct(self, shape, inames):
+        # Check whether the input exhibits a favorable structure
+        # (whether we can broadcast scalar values into SIMD registers)
+        total = set(self.inputs)
+        lower = set(self.inputs[:len(self.inputs) // 2])
+        upper = set(self.inputs[len(self.inputs) // 2:])
+
+        if len(total) == 1:
+            # All input coefficients use the exact same input coefficient.
+            # We implement this by broadcasting it into a SIMD register
+            return prim.Call(ExplicitVCLCast(dtype_floatingpoint()),
+                             (self.inputs[0].realize_direct(shape, inames),)
+                             )
+        elif len(total) == 2 and len(lower) == 1 and len(upper) == 1:
+            # The lower and the upper part of the SIMD register use
+            # the same input coefficient, we combine the SIMD register
+            # from two shorter SIMD types
+            return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()),
+                             (self.inputs[0].realize_direct(shape, inames),
+                              self.inputs[len(self.inputs) // 2].realize_direct(shape, inames),
+                              )
+                             )
+        else:
+            # The input does not exhibit a broadcastable structure, we
+            # need to load scalars into the SIMD vector.
+            raise NotImplementedError("SIMD loads from scalars not implemented!")
+
+
+class SumfactKernelOutputBase(object):
+    @property
+    def within_inames(self):
+        return ()
 
-    def realize(self, sf, i, dep):
-        pass
+    def realize(self, sf, result, insn_dep):
+        return insn_dep
+
+    def realize_direct(self, result, inames, shape, args):
+        raise NotImplementedError
+
+    def __repr__(self):
+        return "SumfactKernelOutputBase()"
+
+
+class VectorSumfactKernelOutput(SumfactKernelOutputBase):
+    def __init__(self, outputs):
+        self.outputs = outputs
+
+    def __repr__(self):
+        return "_".join(repr(o) for o in self.outputs)
+
+    def _add_hadd(self, o, result):
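+        # If all scalar kernels fused into this vector share the same output,
+        # the full register is reduced with horizontal_add. Otherwise each
+        # output owns one half of the register and only that half is summed.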
+        hadd_function = "horizontal_add"
+        if len(set(self.outputs)) > 1:
+            pos = self.outputs.index(o)
+            if pos == 0:
+                hadd_function = "horizontal_add_lower"
+            else:
+                hadd_function = "horizontal_add_upper"
+
+        return prim.Call(prim.Variable(hadd_function), (result,))
+
+    def realize(self, sf, result, insn_dep):
+        outputs = set(self.outputs)
+
+        trial_element, = set(o.trial_element for o in self.outputs)
+        trial_element_index, = set(o.trial_element_index for o in self.outputs)
+        from dune.perftool.sumfact.accumulation import accum_iname
+        element = get_leaf(trial_element, trial_element_index) if trial_element is not None else None
+        inames = tuple(accum_iname(element, mat.rows, i)
+                       for i, mat in enumerate(sf.matrix_sequence))
+        veciname = accum_iname(element, sf.vector_width // len(outputs), "vec")
+        transform(lp.tag_inames, [(veciname, "vec")])
+
+        deps = frozenset()
+        for o in outputs:
+            hadd_result = self._add_hadd(o, maybe_wrap_subscript(result, tuple(prim.Variable(iname) for iname in inames + (veciname,))))
+            deps = deps.union(o.realize(sf, hadd_result, insn_dep, inames=inames, additional_inames=(veciname,)))
+
+        return deps
+
+    def realize_direct(self, result, inames, shape, args):
+        outputs = set(self.outputs)
+
+        # If multiple horizontal_add's are to be performed with 'result'
+        # we need to precompute the result!
+        if len(outputs) > 1:
+            substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
+            subst_rule(substname, (), result)
+            result = prim.Call(prim.Variable(substname), ())
+            transform(lp.precompute, substname, precompute_outer_inames=args["forced_iname_deps"])
+
+        deps = frozenset()
+        for o in outputs:
+            hadd_result = self._add_hadd(o, result)
+            deps = deps.union(o.realize_direct(hadd_result, inames, shape, args))
+
+        return deps
 
 
 class SumfactKernelBase(object):
@@ -35,14 +163,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
                  buffer=None,
                  stage=1,
                  position_priority=None,
-                 restriction=None,
                  insn_dep=frozenset(),
-                 input=None,
-                 accumvar=None,
-                 test_element=None,
-                 test_element_index=None,
-                 trial_element=None,
-                 trial_element_index=None,
+                 input=SumfactKernelInputBase(),
+                 output=SumfactKernelOutputBase(),
                  predicates=frozenset(),
                  ):
         """Create a sum factorization kernel
@@ -118,11 +241,8 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
 
         assert stage in (1, 3)
 
-        if stage == 1:
-            assert isinstance(input, SumfactKernelInputBase)
-
-        if stage == 3:
-            assert isinstance(restriction, tuple)
+        assert isinstance(input, SumfactKernelInputBase)
+        assert isinstance(output, SumfactKernelOutputBase)
 
         assert isinstance(insn_dep, frozenset)
 
@@ -159,31 +279,54 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     # Watch out for the documentation to see which key is used unter what circumstances
     #
 
+    @property
+    def parallel_key(self):
+        """ A key that identifies parallelizable kernels. """
+        return tuple(m.basis_size for m in self.matrix_sequence) + (self.stage, self.buffer)
+
     @property
     def cache_key(self):
         """ The cache key that can be used in generation magic
         Any two sum factorization kernels having the same cache_key
-        are realized simulatenously!
+        are realized simultaneously!
         """
-        return (self.matrix_sequence, self.restriction, self.stage, self.buffer, self.test_element_index)
+        if self.buffer is None:
+            # During dry run, we return something unique to this kernel
+            return repr(self)
+        else:
+            # Later on, kernels that are realized in parallel are identified by their assigned buffer
+            return self.buffer
 
     @property
-    def input_key(self):
+    def inout_key(self):
         """ A cache key for the input coefficients
         Any two sum factorization kernels having the same input_key
-        work on the same input coefficient (and are suitable for simultaneous
-        treatment because of that)
+        work on the same input coefficient (stage 1) or accumulate
+        into the same global data structure (stage 3)
         """
-        return (self.input, self.restriction, self.accumvar, self.trial_element_index)
+        return (repr(self.input), repr(self.output))
 
     @property
     def group_name(self):
-        return "sfgroup_{}_{}_{}_{}".format(self.input, self.restriction, self.accumvar, self.trial_element_index)
+        return "sfgroup_{}_{}".format(self.input, self.output)
 
     #
     # Some convenience methods to extract information about the sum factorization kernel
     #
 
+    def __lt__(self, other):
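+        """ Order kernels by parallel_key, then inout_key, then position priority """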
+        if self.parallel_key != other.parallel_key:
+            return self.parallel_key < other.parallel_key
+        if self.inout_key != other.inout_key:
+            return self.inout_key < other.inout_key
+        if self.position_priority == other.position_priority:
+            return repr(self) < repr(other)
+        if self.position_priority is None:
+            return False
+        if other.position_priority is None:
+            return True
+        return self.position_priority < other.position_priority
+
     @property
     def length(self):
         """ The number of matrices to apply """
@@ -199,14 +342,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
 
     @property
     def within_inames(self):
-        if self.trial_element is None:
-            return ()
-        else:
-            from dune.perftool.sumfact.basis import lfs_inames
-            element = self.trial_element
-            if isinstance(element, MixedElement):
-                element = element.extract_component(self.trial_element_index)[1]
-            return lfs_inames(element, self.restriction)
+        return self.output.within_inames
 
     def vec_index(self, sf):
         """ Map an unvectorized sumfact kernel object to its position
@@ -358,7 +494,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         # Assert all the properties that need to be the same across all subkernels
         assert len(set(k.stage for k in kernels)) == 1
         assert len(set(k.length for k in kernels)) == 1
-        assert len(set(k.restriction for k in kernels)) == 1
         assert len(set(k.within_inames for k in kernels)) == 1
         assert len(set(k.predicates for k in kernels)) == 1
 
@@ -366,7 +501,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         for i in range(kernels[0].length):
             assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1
             assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1
-            assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1
+            assert len(set(tuple(k.matrix_sequence[i].direction for k in kernels))) == 1
             assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1
 
         # Join the instruction dependencies of all subkernels
@@ -437,36 +572,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def within_inames(self):
         return self.kernels[0].within_inames
 
-    @property
-    def test_element(self):
-        return self.kernels[0].test_element
-
-    @property
-    def test_element_index(self):
-        return self.kernels[0].test_element_index
-
-    @property
-    def trial_element(self):
-        return self.kernels[0].trial_element
-
-    @property
-    def trial_element_index(self):
-        return self.kernels[0].trial_element_index
-
     @property
     def predicates(self):
         return self.kernels[0].predicates
 
-    @property
-    def input(self):
-        assert len(set(k.input for k in self.kernels)) == 1
-        return self.kernels[0].input
-
-    @property
-    def accumvar(self):
-        assert len(set(k.accumvar for k in self.kernels)) == 1
-        return self.kernels[0].accumvar
-
     @property
     def transposed(self):
         return self.kernels[0].transposed
@@ -486,13 +595,21 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     # Define the same properties the normal SumfactKernel defines
     #
 
+    @property
+    def input(self):
+        return VectorSumfactKernelInput(tuple(k.input for k in self.kernels))
+
+    @property
+    def output(self):
+        return VectorSumfactKernelOutput(tuple(k.output for k in self.kernels))
+
     @property
     def cache_key(self):
         return (tuple(k.cache_key for k in self.kernels), self.buffer)
 
     @property
-    def input_key(self):
-        return tuple(k.input_key for k in self.kernels)
+    def inout_key(self):
+        return tuple(k.inout_key for k in self.kernels)
 
     @property
     def group_name(self):
@@ -507,10 +624,11 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         return True
 
     def horizontal_index(self, sf):
-        key = tuple(mat.derivative for mat in sf.matrix_sequence)
         for i, k in enumerate(self.kernels):
-            if tuple(mat.derivative for mat in k.matrix_sequence) == key:
-                return i
+            if sf.inout_key == k.inout_key:
+                if tuple(mat.derivative for mat in sf.matrix_sequence) == tuple(mat.derivative for mat in k.matrix_sequence):
+                    return i
+
         return 0
 
     def _quadrature_index(self, sf, visitor):
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index bc671b700e989e694bed572241edc19879942ea7..daeabf9005cbff9c5917185b5711ceb6cb4342a3 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -62,14 +62,27 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
                                  slice_index=slice_index,
                                  )
 
+    @property
+    def _shortname(self):
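+        """ Compact encoding of this matrix's parameters, used to build names in generated code """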
+        infos = ["d{}".format(self.basis_size),
+                 "q{}".format(self.quadrature_size)]
+
+        if self.transpose:
+            infos.append("T")
+
+        if self.derivative:
+            infos.append("dx")
+
+        if self.face is not None:
+            infos.append("f{}".format(self.face))
+
+        if self.slice_size is not None:
+            infos.append("s{}".format(self.slice_index))
+
+        return "".join(infos)
+
     def __str__(self):
-        return "{}{}A{}{}{}" \
-               .format("face{}_".format(self.face) if self.face is not None else "",
-                       "d" if self.derivative else "",
-                       self.basis_size,
-                       "T" if self.transpose else "",
-                       "_slice{}".format(self.slice_index) if self.slice_size is not None else "",
-                       )
+        return "Theta_{}".format(self._shortname)
 
     @property
     def rows(self):
@@ -96,14 +109,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         return size
 
     def pymbolic(self, indices):
-        name = "{}{}Theta{}{}_qp{}_dof{}" \
-               .format("face{}_".format(self.face) if self.face is not None else "",
-                       "d" if self.derivative else "",
-                       "T" if self.transpose else "",
-                       "_slice{}".format(self.slice_index) if self.slice_size is not None else "",
-                       self.quadrature_size,
-                       self.basis_size,
-                       )
+        name = str(self)
         define_theta(name, self)
         return prim.Subscript(prim.Variable(name), indices)
 
@@ -131,7 +137,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         assert len(set(t.quadrature_size for t in tabs)) == 1
         assert len(set(t.basis_size for t in tabs)) == 1
         assert len(set(t.transpose for t in tabs)) == 1
-        assert len(set(t.face for t in tabs)) == 1
+        assert len(set(t.direction for t in tabs)) == 1
         assert len(set(t.slice_size for t in tabs)) == 1
         self.tabs = tabs
 
@@ -140,11 +146,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         self.width = width
 
     def __str__(self):
-        abbrevs = tuple("{}A{}{}".format("d" if t.derivative else "",
-                                         self.basis_size,
-                                         "s{}".format(t.slice_index) if t.slice_size is not None else "")
-                        for t in self.tabs)
-        return "_".join(abbrevs)
+        return "Theta{}".format("_".join((t._shortname for t in self.tabs)))
 
     @property
     def quadrature_size(self):
@@ -196,15 +198,8 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
             theta = self.tabs[0].pymbolic(indices[:-1])
             return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=get_vcl_type_size(dtype_floatingpoint())), (theta,))
 
-        abbrevs = tuple("{}x{}".format("d" if t.derivative else "",
-                                       "s{}".format(t.slice_index) if t.slice_size is not None else "")
-                        for t in self.tabs)
-        name = "ThetaLarge{}{}_{}_qp{}_dof{}".format("face{}_".format(self.face) if self.face is not None else "",
-                                                     "T" if self.transpose else "",
-                                                     "_".join(abbrevs),
-                                                     self.tabs[0].quadrature_size,
-                                                     self.tabs[0].basis_size,
-                                                     )
+        name = str(self)
+
         for i, tab in enumerate(self.tabs):
             define_theta(name, tab, additional_indices=(i,), width=self.width)
 
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 690a1c0ba912913f068fe2f295dcbc4c498bdf1e..4f7abe67048b3016db2277153844757116996164 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -48,13 +48,6 @@ def attach_vectorization_info(sf):
         return _cache_vectorization_info(sf, None)
 
 
-def position_penalty_factor(sf):
-    if isinstance(sf, SumfactKernel) or sf.vertical_width > 1:
-        return 1
-    else:
-        return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length))
-
-
 @backend(interface="vectorization_strategy", name="model")
 def costmodel(sf):
     # Penalize vertical vectorization
@@ -66,13 +59,7 @@ def costmodel(sf):
         scalar_penalty = get_vcl_type_size(dtype_floatingpoint())
 
     # Return total operations
-    return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
-
-
-@backend(interface="vectorization_strategy", name="fromlist")
-def fromlist_costmodel(sf):
-    # The fromlist strategy needs to reuse the cost model!
-    return costmodel(sf)
+    return sf.operations * vertical_penalty * scalar_penalty
 
 
 @backend(interface="vectorization_strategy", name="explicit")
@@ -90,7 +77,7 @@ def explicit_costfunction(sf):
 
     if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
         # Penalize position mapping
-        return sf.operations * position_penalty_factor(sf)
+        return sf.operations
     else:
         return 1000000000000
 
@@ -172,59 +159,21 @@ def decide_vectorization_strategy():
     logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
                  .format(len(active_sumfacts)))
 
-    # Find the best vectorization strategy by using a costmodel
-    width = get_vcl_type_size(dtype_floatingpoint())
-
     #
-    # Optimize over all the possible quadrature point tuples
+    # Find the best vectorization strategy by using a costmodel
     #
-    quad_points = [quadrature_points_per_direction()]
-    if get_form_option("vectorization_allow_quadrature_changes"):
-        sf = next(iter(active_sumfacts))
-        depth = 1
-        while depth <= width:
-            i = 0 if sf.matrix_sequence[0].face is None else 1
-            quad = list(quadrature_points_per_direction())
-            quad[i] = round_to_multiple(quad[i], depth)
-            quad_points.append(tuple(quad))
-            depth = depth * 2
-        quad_points = list(set(quad_points))
-
-    if get_form_option("vectorization_strategy") == "fromlist":
-        # This is a bit special and does not follow the minimization procedure at all
-
-        def _choose_strategy_from_list(stage1_sumfacts):
-            strategy = 0
-            for qp in quad_points:
-                for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp):
-                    if strategy == int(get_form_option("vectorization_list_index")):
-                        # Output the strategy and its cost into a separate file
-                        if get_global_context_value("form_type") == "jacobian_apply":
-                            with open("strategycosts.csv", "a") as f:
-                                f.write("{} {}\n".format(strategy, strategy_cost((qp, strat))))
-                        return qp, strat
-                    strategy = strategy + 1
-
-            raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_form_option("vectorization_list_index")))
-
-        s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1)
-
-        total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points)
-        print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_form_option("vectorization_list_index")),
-                                                                                              total))
-        qp, sfdict = _choose_strategy_from_list(s1_sumfacts)
-
-        keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1)
-        for key in keys:
-            key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key)
-            minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
-                          key=fixedqp_strategy_costfunction(qp))
-            sfdict = add_to_frozendict(sfdict, minimum)
-    else:
-        # Find the minimum cost strategy between all the quadrature point tuples
-        optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points}
-        qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
-        sfdict = optimal_strategies[qp]
+    # Note that this optimization procedure uses a hierarchic approach to avoid the
+    # unfavorable combinatorial complexity of enumerating all possible vectorization
+    # opportunities at once. The optimization is performed at several levels (reflected
+    # in the names of the functions implementing them): a lower level either combines
+    # the optimal solutions found at the next higher level into one solution or picks
+    # the optimum among them:
+    # * Level 1: Find an optimal quadrature point tuple (by taking the optimum of the level 2 optima)
+    # * Level 2: Split kernels by parallelizability and combine the per-group optima into one solution
+    # * Level 3: Optimize the number of different inputs to consider
+    # * Level 4: Optimize the horizontal/vertical/hybrid strategy
+    width = get_vcl_type_size(dtype_floatingpoint())
+    qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width)
 
     set_quadrature_points(qp)
 
@@ -239,88 +188,104 @@ def decide_vectorization_strategy():
         _cache_vectorization_info(sf, sfdict[sf])
 
 
-def fixed_quadrature_optimal_vectorization(sumfacts, width, qp):
-    """ For a given quadrature point tuple, find the optimal strategy!
+def level1_optimal_vectorization_strategy(sumfacts, width):
+    # Gather a list of possible quadrature point tuples
+    quad_points = [quadrature_points_per_direction()]
+    if get_form_option("vectorization_allow_quadrature_changes"):
+        sf = next(iter(sumfacts))
+        depth = 1
+        while depth <= width:
+            i = 0 if sf.matrix_sequence[0].face is None else 1
+            quad = list(quadrature_points_per_direction())
+            quad[i] = round_to_multiple(quad[i], depth)
+            quad_points.append(tuple(quad))
+            depth = depth * 2
+        quad_points = list(set(quad_points))
 
-    In order to have this scale sufficiently, we cannot simply list all vectorization
-    opportunities and score them individually, but we need to do a divide and conquer
-    approach.
-    """
-    # Find the sets of simultaneously realizable kernels (thats an equivalence relation)
-    keys = frozenset(sf.input_key for sf in sumfacts)
+    # Find the minimum cost strategy between all the quadrature point tuples
+    optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points}
+    qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
+
+    return qp, optimal_strategies[qp]
+
+
+def level2_optimal_vectorization_strategy(sumfacts, width, qp):
+    # Find the sets of simultaneously realizable kernels
+    keys = frozenset(sf.parallel_key for sf in sumfacts)
 
     # Find minimums for each of these sets
     sfdict = frozendict()
 
     for key in keys:
-        key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
-        minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
-                      key=fixedqp_strategy_costfunction(qp))
-        sfdict = add_to_frozendict(sfdict, minimum)
+        key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key)
+        key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp),
+                           key=fixedqp_strategy_costfunction(qp))
+        sfdict = add_to_frozendict(sfdict, key_strategy)
 
     return sfdict
 
 
-def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
+def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
+    for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
+        # Kernels that are not covered by the generated strategy are assigned
+        # a non-vectorized implementation
+        yield add_to_frozendict(opp,
+                                {sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp})
+
+
+def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()):
     if len(sumfacts) == 0:
-        # We have gone into recursion deep enough to have all sum factorization nodes
-        # assigned their vectorized counterpart. We can yield the result now!
         yield already
         return
 
-    # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
-    # fromlist strategy to pick correct strategies across different program runs
-    sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
-
-    # Otherwise we pick a random sum factorization kernel and construct all the vectorization
-    # opportunities realizing this particular kernel and go into recursion.
-    sf_to_decide = sumfacts[0]
-
-    # Have "unvectorized" as an option, although it is not good
-    for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
-                                                              width,
-                                                              qp,
-                                                              add_to_frozendict(already,
-                                                                                {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
-                                                                                ),
-                                                              ):
-        yield opp
-
-    horizontal = 1
-    while horizontal <= width:
-        # Iterate over the possible combinations of sum factorization kernels
-        # taking into account all the permutations of kernels. This also includes
-        # combinations which use a padding of 1 - but only for pure horizontality.
-        generators = [it.permutations(sumfacts, horizontal)]
-        if horizontal >= 4:
-            generators.append(it.permutations(sumfacts, horizontal - 1))
-        for combo in it.chain(*generators):
-            # The chosen kernels must be part of the kernels for recursion
-            # to work correctly
-            if sf_to_decide not in combo:
-                continue
+    # Track whether this generator has yielded any vectorization opportunity. If it has
+    # not, we yield the (incomplete) strategy gathered so far, which is then completed
+    # with unvectorized kernel implementations.
+    yielded = False
+
+    # Determine the different input/output coefficients the given kernels work on
+    keys = frozenset(sf.inout_key for sf in sumfacts)
+
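+    # Group the kernels by these keys, sorting both keys and kernels to get a
+    # deterministic traversal order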
+    inoutkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.inout_key == key, sumfacts))) for key in sorted(keys)]
+
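+    # Try building vector kernels from one or from two of these groups simultaneously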
+    for parallel in (1, 2):
+        if parallel > len(keys):
+            continue
+
+        horizontal = 1
+        while horizontal <= width // parallel:
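+            # Combine the first 'horizontal' kernels of each of the 'parallel' chosen groups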
+            combo = sum((inoutkey_sumfacts[part][:horizontal] for part in range(parallel)), ())
+
+            vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
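+            # Advance the horizontal width before the validity checks below so that
+            # the loop terminates on every path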
+            horizontal *= 2
 
-            # Set up the vectorization dict for this combo
-            vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
             if vecdict is None:
                 # This particular choice was rejected for some reason.
                 # Possible reasons:
                 # * the quadrature point tuple not being suitable
                 #   for this vectorization strategy
+                # * there are not enough horizontal kernels
                 continue
 
             # Go into recursion to also vectorize all kernels not in this combo
-            for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
-                                                                      width,
-                                                                      qp,
-                                                                      add_to_frozendict(already, vecdict),
-                                                                      ):
+            for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
+                                                                        width,
+                                                                        qp,
+                                                                        add_to_frozendict(already, vecdict),
+                                                                        ):
+                yielded = True
                 yield opp
 
-        horizontal = horizontal * 2
+    # If we did not yield on this recursion level, yield what we got so far
+    if not yielded:
+        yield already
 
 
 def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
+    # Discard combinations that do not provide the requested number of horizontal kernels
+    # (a deficit of one is tolerated and treated as padding)
+    if len(sumfacts) not in (horizontal, horizontal - 1):
+        return None
+
     # Enhance the list of sumfact nodes by adding vertical splittings
     kernels = []
     for sf in sumfacts:
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index e302f28c0e63d03d013c181dc0eb4f831ec1648c..b29ebe221387a5af529f0dec29592b4de8dc762a 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -82,3 +82,14 @@ def list_diff(l1, l2):
             if item not in l2:
                 l.append(item)
         return l
+
+
+def get_leaf(element, index):
+    """ return a leaf element if the given element is a MixedElement """
+    leaf_element = element
+    from ufl import MixedElement
+    if isinstance(element, MixedElement):
+        assert isinstance(index, int)
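+        # extract_component returns a (component, sub element) pair; we are only
+        # interested in the element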
+        leaf_element = element.extract_component(index)[1]
+
+    return leaf_element