diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh
index 9bb1d2e2b98930176835ff7bb7fbba3dfb089d11..f95f1f6a28289d3ef0b62fdef5e818d5b87530c4 100755
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -8,3 +8,7 @@ popd
 pushd dune/perftool/vectorclass
 git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
+pushd python/ufl
+git apply ../../patches/ufl/conditional-uflid.patch
diff --git a/patches/ufl/conditional-uflid.patch b/patches/ufl/conditional-uflid.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b469437ed6507bca7ec4ed4b56fa8578804a9809
--- /dev/null
+++ b/patches/ufl/conditional-uflid.patch
@@ -0,0 +1,34 @@
+diff --git a/ufl/conditional.py b/ufl/conditional.py
+index 352624c..ebd647f 100644
+--- a/ufl/conditional.py
++++ b/ufl/conditional.py
+@@ -27,6 +27,7 @@ from ufl.constantvalue import as_ufl
+ from ufl.precedence import parstr
+ from ufl.exprequals import expr_equals
+ from ufl.checks import is_true_ufl_scalar
++from ufl.core.ufl_id import attach_ufl_id
+ # --- Condition classes ---
+@@ -221,10 +222,11 @@ class NotCondition(Condition):
+ @ufl_type(num_ops=3, inherit_shape_from_operand=1,
+           inherit_indices_from_operand=1)
+ class Conditional(Operator):
+-    __slots__ = ()
++    __slots__ = ("_ufl_id")
+-    def __init__(self, condition, true_value, false_value):
++    def __init__(self, condition, true_value, false_value, ufl_id=None):
+         if not isinstance(condition, Condition):
+             error("Expectiong condition as first argument.")
+         true_value = as_ufl(true_value)
+@@ -244,6 +246,7 @@ class Conditional(Operator):
+                         condition.ufl_operands[1].ufl_free_indices == ())):
+                 error("Non-scalar == or != is not allowed.")
++        self._ufl_id = self._init_ufl_id(ufl_id)
+         Operator.__init__(self, (condition, true_value, false_value))
+     def evaluate(self, x, mapping, component, index_values):
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index 113e3b76ab040bf937006a8c4509cc65414721c5..96b7923a2917aeeddca359733b5dceed91b9ca75 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -130,6 +130,8 @@ class DuneCExpressionToCodeMapper(CExpressionToCodeMapper):
                               self.rec(expr.shift, PREC_SHIFT)),
             enclosing_prec, PREC_SHIFT)
+    map_tagged_variable = CExpressionToCodeMapper.map_variable
 class DuneASTBuilder(CASTBuilder):
     def function_manglers(self):
diff --git a/python/dune/perftool/pdelab/__init__.py b/python/dune/perftool/pdelab/__init__.py
index fbe00bb36a49a7fd3b336b26821b5616a8c2cec6..9396782c60ede97a6173d1fa1b8d5e8f33f9e233 100644
--- a/python/dune/perftool/pdelab/__init__.py
+++ b/python/dune/perftool/pdelab/__init__.py
@@ -42,6 +42,19 @@ class PDELabInterface(object):
         # The visitor instance will be registered by its init method
         self.visitor = None
+    # Accumulation interfaces
+    def get_accumulation_info(self, expr, visitor):
+        from dune.perftool.pdelab.localoperator import get_accumulation_info
+        return get_accumulation_info(expr, visitor)
+    def list_accumulation_infos(self, expr, visitor):
+        from dune.perftool.pdelab.localoperator import list_accumulation_infos
+        return list_accumulation_infos(expr, visitor)
+    def generate_accumulation_instruction(self, expr, visitor):
+        from dune.perftool.pdelab.localoperator import generate_accumulation_instruction
+        return generate_accumulation_instruction(expr, visitor)
     # TODO: The following ones are actually entirely PDELab independent!
     # They should be placed elsewhere and be used directly in the visitor.
@@ -60,6 +73,10 @@ class PDELabInterface(object):
     def lfs_inames(self, element, restriction, number=None, context=''):
         return lfs_inames(element, restriction, number, context)
+    def initialize_function_spaces(self, expr, visitor):
+        from dune.perftool.pdelab.spaces import initialize_function_spaces
+        return initialize_function_spaces(expr, visitor)
     # Test and trial function related generator functions
@@ -72,17 +89,17 @@ class PDELabInterface(object):
     def pymbolic_reference_gradient(self, element, restriction, number, context=''):
         return pymbolic_reference_gradient(element, restriction, number, context)
-    def pymbolic_trialfunction_gradient(self, element, restriction, tree_path):
-        return pymbolic_trialfunction_gradient(self.visitor, element, restriction, tree_path)
+    def pymbolic_trialfunction_gradient(self, element, restriction, index):
+        return pymbolic_trialfunction_gradient(element, restriction, index)
-    def pymbolic_apply_function_gradient(self, element, restriction, tree_path):
-        return pymbolic_apply_function_gradient(self.visitor, element, restriction, tree_path)
+    def pymbolic_apply_function_gradient(self, element, restriction, index):
+        return pymbolic_apply_function_gradient(element, restriction, index)
-    def pymbolic_trialfunction(self, element, restriction, tree_path):
-        return pymbolic_trialfunction(self.visitor, element, restriction, tree_path)
+    def pymbolic_trialfunction(self, element, restriction, index):
+        return pymbolic_trialfunction(element, restriction, index)
-    def pymbolic_apply_function(self, element, restriction, tree_path):
-        return pymbolic_apply_function(self.visitor, element, restriction, tree_path)
+    def pymbolic_apply_function(self, element, restriction, index):
+        return pymbolic_apply_function(element, restriction, index)
     # Parameter function related generator functions
diff --git a/python/dune/perftool/pdelab/argument.py b/python/dune/perftool/pdelab/argument.py
index 149dc47ca9461d7950c9216bed21b1e9ee437f3b..5adc0eb48f2ba72a2c29b59ee7a66a046ed78edc 100644
--- a/python/dune/perftool/pdelab/argument.py
+++ b/python/dune/perftool/pdelab/argument.py
@@ -90,35 +90,35 @@ def accumulation_mangler(target, func, dtypes):
-def pymbolic_trialfunction_gradient(visitor, element, restriction, tree_path):
-    rawname = "gradu" + "_".join(str(c) for c in tree_path)
+def pymbolic_trialfunction_gradient(element, restriction, index):
+    rawname = "gradu_{}".format(index)
     name = restricted_name(rawname, restriction)
     container = name_coefficientcontainer(restriction)
-    evaluate_coefficient_gradient(visitor, element, name, container, restriction, tree_path)
+    evaluate_coefficient_gradient(element, name, container, restriction, index)
     return Variable(name)
-def pymbolic_trialfunction(visitor, element, restriction, tree_path):
-    rawname = "u" + "_".join(str(c) for c in tree_path)
+def pymbolic_trialfunction(element, restriction, index):
+    rawname = "u_{}".format(index)
     name = restricted_name(rawname, restriction)
     container = name_coefficientcontainer(restriction)
-    evaluate_coefficient(visitor, element, name, container, restriction, tree_path)
+    evaluate_coefficient(element, name, container, restriction, index)
     return Variable(name)
-def pymbolic_apply_function_gradient(visitor, element, restriction, tree_path):
-    rawname = "gradz_func" + "_".join(str(c) for c in tree_path)
+def pymbolic_apply_function_gradient(element, restriction, index):
+    rawname = "gradz_func_{}".format(index)
     name = restricted_name(rawname, restriction)
     container = name_applycontainer(restriction)
-    evaluate_coefficient_gradient(visitor, element, name, container, restriction, tree_path)
+    evaluate_coefficient_gradient(element, name, container, restriction, index)
     return Variable(name)
-def pymbolic_apply_function(visitor, element, restriction, tree_path):
-    rawname = "z_func" + "_".join(str(c) for c in tree_path)
+def pymbolic_apply_function(element, restriction, index):
+    rawname = "z_func_{}".format(index)
     name = restricted_name(rawname, restriction)
     container = name_applycontainer(restriction)
-    evaluate_coefficient(visitor, element, name, container, restriction, tree_path)
+    evaluate_coefficient(element, name, container, restriction, index)
     return Variable(name)
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index f50367d64fc8ffd550345cee71cf9504bf32f110..9d58603aea4b87e909401165c21fb7979bb5ed26 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -11,16 +11,22 @@ from dune.perftool.generation import (backend,
 from dune.perftool.options import (option_switch,
-from dune.perftool.pdelab.spaces import (lfs_child,
+from dune.perftool.pdelab.spaces import (lfs_iname,
+                                         lfs_inames,
-                                         lfs_inames
+                                         type_leaf_gfs,
 from dune.perftool.pdelab.geometry import (component_iname,
+                                           name_jacobian_inverse_transposed,
+                                           to_cell_coordinates,
+from dune.perftool.pdelab.localoperator import (lop_template_ansatz_gfs,
+                                                lop_template_test_gfs,
+                                                )
 from dune.perftool.tools import (get_pymbolic_basename,
@@ -36,7 +42,7 @@ from loopy import Reduction
 def typedef_localbasis(element, name):
-    basis_type = "{}::Traits::FiniteElementMap::Traits::FiniteElementType::Traits::LocalBasisType".format(type_gfs(element))
+    basis_type = "{}::Traits::FiniteElementMap::Traits::FiniteElementType::Traits::LocalBasisType".format(type_leaf_gfs(element))
     return "using {} = typename {};".format(name, basis_type)
@@ -105,9 +111,9 @@ def pymbolic_basis(leaf_element, restriction, number, context=''):
     name = "phi_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_basis(leaf_element, name, restriction)
-    iname = lfs_inames(leaf_element, restriction, number, context=context)[0]
+    iname, = lfs_inames(leaf_element, restriction, number, context=context)
-    return Subscript(Variable(name), (Variable(iname), ))
+    return Subscript(Variable(name), (Variable(iname),))
@@ -133,7 +139,7 @@ def pymbolic_reference_gradient(leaf_element, restriction, number, context=''):
     name = "js_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_reference_gradient(leaf_element, name, restriction)
-    iname = lfs_inames(leaf_element, restriction, number, context=context)[0]
+    iname, = lfs_inames(leaf_element, restriction, number, context=context)
     return Subscript(Variable(name), (Variable(iname), 0))
@@ -148,86 +154,73 @@ def shape_as_pymbolic(shape):
-def evaluate_coefficient(visitor, element, name, container, restriction, tree_path):
-    from ufl.functionview import select_subelement
-    sub_element = select_subelement(element, tree_path)
+def evaluate_coefficient(element, name, container, restriction, index):
+    sub_element = element
+    if element.num_sub_elements() > 0:
+        sub_element = element.extract_component(index)[1]
-    # Determine the rank of the trialfunction tensor
-    rank = len(sub_element.value_shape())
+    from ufl import FiniteElement
+    assert isinstance(sub_element, FiniteElement)
-    shape = sub_element.value_shape()
-    shape_impl = ('arr', ) * rank
-    idims = tuple(component_iname(count=i) for i in range(rank))
-    leaf_element = sub_element
-    from ufl import VectorElement, TensorElement
-    if isinstance(sub_element, (VectorElement, TensorElement)):
-        leaf_element = sub_element.sub_elements()[0]
+    temporary_variable(name, shape=(),)
-    temporary_variable(name, shape=shape, shape_impl=shape_impl)
-    lfs = name_lfs(element, restriction, tree_path)
-    basis = visitor.interface.pymbolic_basis(leaf_element, restriction, 0, context='trial')
-    index, = get_pymbolic_indices(basis)
+    lfs = name_lfs(element, restriction, index)
+    basis = pymbolic_basis(sub_element, restriction, 0, context='trial')
+    basisindex, = get_pymbolic_indices(basis)
-    if isinstance(sub_element, (VectorElement, TensorElement)):
-        lfs = lfs_child(lfs, idims, shape=shape_as_pymbolic(shape), symmetry=element.symmetry())
     if get_option("blockstructured"):
         from dune.perftool.blockstructured.argument import pymbolic_coefficient
-        coeff = pymbolic_coefficient(container, lfs, element, index)
+        coeff = pymbolic_coefficient(container, lfs, element, basisindex)
         from dune.perftool.pdelab.argument import pymbolic_coefficient
-        coeff = pymbolic_coefficient(container, lfs, index)
+        coeff = pymbolic_coefficient(container, lfs, basisindex)
-    assignee = Subscript(Variable(name), tuple(Variable(i) for i in idims))
+    assignee = Variable(name)
     reduction_expr = Product((coeff, basis))
-    instruction(expression=Reduction("sum", index, reduction_expr, allow_simultaneous=True),
+    instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True),
-                forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset(idims)),
+                forced_iname_deps=frozenset(get_backend("quad_inames")()),
-def evaluate_coefficient_gradient(visitor, element, name, container, restriction, tree_path):
-    # First we determine the rank of the tensor we are talking about
-    from ufl.functionview import select_subelement
-    sub_element = select_subelement(element, tree_path)
-    rank = len(sub_element.value_shape()) + 1
-    # We do then set some variables accordingly
-    shape = sub_element.value_shape() + (element.cell().geometric_dimension(),)
-    shape_impl = ('arr',) * rank
-    idims = tuple(component_iname(count=i) for i in range(rank))
-    leaf_element = sub_element
-    from ufl import VectorElement, TensorElement
-    if isinstance(sub_element, (VectorElement, TensorElement)):
-        leaf_element = sub_element.sub_elements()[0]
-    # and proceed to call the necessary generator functions
-    temporary_variable(name, shape=shape, shape_impl=shape_impl)
-    lfs = name_lfs(element, restriction, tree_path)
-    basis = visitor.interface.pymbolic_reference_gradient(leaf_element, restriction, 0, context='trialgrad')
-    index, _ = get_pymbolic_indices(basis)
-    if isinstance(sub_element, (VectorElement, TensorElement)):
-        lfs = lfs_child(lfs, idims[:-1], shape=shape_as_pymbolic(shape[:-1]), symmetry=element.symmetry())
+def evaluate_coefficient_gradient(element, name, container, restriction, index):
+    sub_element = element
+    if element.num_sub_elements() > 0:
+        sub_element = element.extract_component(index)[1]
+    from ufl import FiniteElement
+    assert isinstance(sub_element, FiniteElement)
+    temporary_variable(name,
+                       shape=(element.cell().geometric_dimension(),),
+                       shape_impl=("arr",),
+                       )
+    dimindex = component_iname(count=0)
+    lfs = name_lfs(element, restriction, index)
+    basis = pymbolic_reference_gradient(sub_element, restriction, 0, context='trialgrad')
+    basisindex, _ = get_pymbolic_indices(basis)
+    from dune.perftool.tools import maybe_wrap_subscript
+    basis = maybe_wrap_subscript(basis, Variable(dimindex))
     if get_option("blockstructured"):
         from dune.perftool.blockstructured.argument import pymbolic_coefficient
-        coeff = pymbolic_coefficient(container, lfs, element, index)
+        coeff = pymbolic_coefficient(container, lfs, element, basisindex)
         from dune.perftool.pdelab.argument import pymbolic_coefficient
-        coeff = pymbolic_coefficient(container, lfs, index)
+        coeff = pymbolic_coefficient(container, lfs, basisindex)
     assignee = Subscript(Variable(name), tuple(Variable(i) for i in idims))
-    reduction_expr = Product((coeff, Subscript(Variable(get_pymbolic_basename(basis)), basis.index + (Variable(idims[-1]),))))
+    reduction_expr = Product((coeff, basis))
-    instruction(expression=Reduction("sum", index, reduction_expr, allow_simultaneous=True),
+    instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True),
-                forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset(idims)),
+                forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset({dimindex})),
-                tags=frozenset({"quad"})
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index ecf578882a001960830e0dad814b5613b01e71e0..6c54752e0e2a1e6067a36c91076e6f1bad45d96b 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -34,7 +34,7 @@ import dune.perftool.loopy.mangler
 from pymbolic.primitives import Variable
 import pymbolic.primitives as prim
-from pytools import Record
+from pytools import Record, ImmutableRecord
 import ufl.classes as uc
 import loopy as lp
@@ -212,44 +212,33 @@ class AccumulationSpace(Record):
             return (self.restriction,)
-def determine_accumulation_space(expr, number, measure, idims=None):
-    from dune.perftool.ufl.modified_terminals import extract_modified_arguments
-    args = extract_modified_arguments(expr, argnumber=number)
-    if measure == 'exterior_facet':
-        for ma in args:
-            ma.restriction = Restriction.NEGATIVE
-    # If this is a residual term we return a dummy object
-    if len(args) == 0:
+# TODO maybe move this onto the visitor as a helper function?
+def determine_accumulation_space(info, number):
+    if info is None:
         return AccumulationSpace()
     ma = next(iter(args))
-    # Extract information on the finite element
-    from ufl.functionview import select_subelement
-    subel = select_subelement(ma.argexpr.ufl_element(), ma.tree_path)
+    assert info is not None
+    element = info.element
+    subel = element
+    from ufl import MixedElement
+    if isinstance(element, MixedElement):
+        subel = element.extract_component(info.element_index)[1]
     # And generate a local function space for it!
-    from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, lfs_child
-    lfs = name_lfs(ma.argexpr.ufl_element(), ma.restriction, ma.tree_path)
+    from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, lfs_iname
+    lfs = name_lfs(element, info.restriction, info.element_index)
     from dune.perftool.generation import valuearg
     from loopy.types import NumpyType
     valuearg(lfs, dtype=NumpyType("str"))
-    if len(subel.value_shape()) != 0:
-        from dune.perftool.pdelab.geometry import component_iname
-        if idims is None:
-            idims = tuple(component_iname(context='arg', count=i) for i in range(len(subel.value_shape())))
-        lfs = lfs_child(lfs, idims, shape=subel.value_shape(), symmetry=subel.symmetry())
-        subel = subel.sub_elements()[0]
     if get_option("blockstructured"):
         from dune.perftool.blockstructured.tools import micro_index_to_macro_index
         from dune.perftool.blockstructured.spaces import lfs_inames
         lfsi = micro_index_to_macro_index(subel, lfs_inames(subel, ma.restriction, count=number)[0])
         from dune.perftool.pdelab.spaces import lfs_inames
-        lfsi = Variable(lfs_inames(subel, ma.restriction, count=number)[0])
+        lfsi = Variable(lfs_iname(subel, info.restriction, count=number))
     # If the LFS is not yet a pymbolic expression, make it one
     from pymbolic.primitives import Expression
@@ -258,7 +247,7 @@ def determine_accumulation_space(expr, number, measure, idims=None):
     return AccumulationSpace(lfs=lfs,
-                             restriction=ma.restriction,
+                             restriction=info.restriction,
@@ -308,6 +297,82 @@ def boundary_predicates(expr, measure, subdomain_id, visitor):
     return predicates
+class PDELabAccumulationInfo(ImmutableRecord):
+    def __init__(self,
+                 element=None,
+                 element_index=0,
+                 restriction=None,
+                 inames=(),
+                 ):
+        ImmutableRecord.__init__(self,
+                                 element=element,
+                                 element_index=element_index,
+                                 restriction=restriction,
+                                 inames=inames,
+                                 )
+    def __eq__(self, other):
+        return type(self) == type(other) and self.element_index == other.element_index and self.restriction == other.restriction
+    def __hash__(self):
+        return (self.element_index, self.restriction)
+def _list_infos(expr, number, visitor):
+    from dune.perftool.ufl.modified_terminals import extract_modified_arguments
+    ma = extract_modified_arguments(expr, argnumber=number)
+    if len(ma) == 0:
+        if number == 1:
+            yield None
+        return
+    element = ma[0].argexpr.ufl_element()
+    from dune.perftool.ufl.modified_terminals import Restriction
+    if visitor.measure == "cell":
+        restrictions = (Restriction.NONE,)
+    elif visitor.measure == "exterior_facet":
+        restrictions = (Restriction.NEGATIVE,)
+    elif visitor.measure == "interior_facet":
+        restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE)
+    for res in restrictions:
+        for ei in range(element.num_sub_elements() + 1):
+            yield PDELabAccumulationInfo(element_index=ei, restriction=res)
+def list_accumulation_infos(expr, visitor):
+    testgen = _list_infos(expr, 0, visitor)
+    trialgen = _list_infos(expr, 1, visitor)
+    import itertools
+    return itertools.product(testgen, trialgen)
+def get_accumulation_info(expr, visitor):
+    element = expr.ufl_element()
+    leaf_element = element
+    element_index = 0
+    from ufl import MixedElement
+    if isinstance(expr.ufl_element(), MixedElement):
+        element_index = visitor.indices[0]
+        leaf_element = element.extract_component(element_index)[1]
+    restriction = visitor.restriction
+    if visitor.measure == 'exterior_facet':
+        from dune.perftool.pdelab.restriction import Restriction
+        restriction = Restriction.NEGATIVE
+    inames = visitor.interface.lfs_inames(leaf_element,
+                                          restriction,
+                                          expr.number()
+                                          )
+    return PDELabAccumulationInfo(element=expr.ufl_element(),
+                                  element_index=element_index,
+                                  restriction=restriction,
+                                  inames=inames,
+                                  )
 def grad_iname(index, dim):
     from dune.perftool.pdelab.index import name_index
@@ -316,111 +381,60 @@ def grad_iname(index, dim):
     return name
-def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
-    # When we do not do sumfactorization we do not split the test function
-    assert(accterm.argument.expr is None)
-    # TODO boundary_predicates may reset visitor's inames, is that wanted?
-    predicates = boundary_predicates(accterm.term, measure, subdomain_id, visitor)
-    # Do the tree traversal to get a pymbolic expression representing this expression
-    pymbolic_expr = visitor(accterm.term)
-    # It may happen that an entire accumulation term vanishes. We do nothing in that case
-    if pymbolic_expr == 0:
-        return
+def generate_accumulation_instruction(expr, visitor):
     # Collect the lfs and lfs indices for the accumulate call
-    test_lfs = determine_accumulation_space(accterm.term, 0, measure)
+    test_lfs = determine_accumulation_space(visitor.test_info, 0)
     # In the jacobian case, also determine the space for the ansatz space
-    ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure)
+    ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1)
     # Collect the lfs and lfs indices for the accumulate call
     from dune.perftool.pdelab.argument import name_accumulation_variable
-    accumvar = name_accumulation_variable((test_lfs.get_restriction() + ansatz_lfs.get_restriction()))
+    accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
+    predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id, visitor.copy())
     rank = 1 if ansatz_lfs.lfs is None else 2
     from dune.perftool.pdelab.argument import PDELabAccumulationFunction
     from pymbolic.primitives import Call
-    expr = Call(PDELabAccumulationFunction(accumvar, rank),
-                (test_lfs.get_args() + ansatz_lfs.get_args() + (pymbolic_expr,))
-                )
+    accexpr = Call(PDELabAccumulationFunction(accumvar, rank),
+                   (test_lfs.get_args() + ansatz_lfs.get_args() + (expr,))
+                   )
     from dune.perftool.generation import instruction
     from dune.perftool.options import option_switch
     quad_inames = visitor.interface.quadrature_inames()
+    lfs_inames = frozenset(visitor.test_info.inames)
+    if visitor.trial_info:
+        lfs_inames = lfs_inames.union(visitor.trial_info.inames)
-                expression=expr,
-                forced_iname_deps=frozenset(quad_inames).union(frozenset(visitor.inames)),
+                expression=accexpr,
+                forced_iname_deps=lfs_inames.union(frozenset(quad_inames)),
-def visit_integrals(integrals):
-    for integral in integrals:
-        integrand = integral.integrand()
-        measure = integral.integral_type()
-        subdomain_id = integral.subdomain_id()
-        subdomain_data = integral.subdomain_data()
-        # Maybe make the jacobian inverse diagonal!
-        if get_option('diagonal_transformation_matrix'):
-            if not get_option('turn_off_diagonal_jacobian'):
-                from dune.perftool.ufl.transformations.axiparallel import diagonal_jacobian
-                integrand = diagonal_jacobian(integrand)
-        # Generate code for the LFS trees present in the form
-        from dune.perftool.ufl.modified_terminals import extract_modified_arguments
-        test_ma = extract_modified_arguments(integrand, argnumber=0)
-        trial_ma = extract_modified_arguments(integrand, coeffcount=0)
-        apply_ma = extract_modified_arguments(integrand, coeffcount=1)
-        import itertools
-        for ma in itertools.chain(test_ma, trial_ma, apply_ma):
-            if measure == 'exterior_facet':
-                ma.restriction = Restriction.NEGATIVE
-            from dune.perftool.pdelab.spaces import traverse_lfs_tree
-            traverse_lfs_tree(ma)
-        # Now split the given integrand into accumulation
-        # expressions. If we do sumfactorization we cut the test
-        # argument from the rest of the expression. This gives the
-        # right input for the sumfactorization kernel of stage 3.
-        from dune.perftool.ufl.extract_accumulation_terms import split_into_accumulation_terms
-        if get_option('sumfact'):
-            accterms = split_into_accumulation_terms(integrand, cut_test_arg=True, split_gradients=True)
-        else:
-            accterms = split_into_accumulation_terms(integrand)
-        # Iterate over the terms and generate a kernel
-        for accterm in accterms:
-            # Get component indices
-            indexmap = {}
-            from dune.perftool.ufl.componentindex import component_index_mapping
-            if accterm.argument.expr is not None:
-                indexmap.update(component_index_mapping(accterm.indexed_test_arg()))
-            indexmap.update(component_index_mapping(accterm.term))
-            # Get a transformer instance for this kernel
-            if get_option('sumfact'):
-                from dune.perftool.sumfact import SumFactInterface
-                interface = SumFactInterface()
-            elif get_option('blockstructured'):
-                from dune.perftool.blockstructured import BlockStructuredInterface
-                interface = BlockStructuredInterface()
-            else:
-                from dune.perftool.pdelab import PDELabInterface
-                interface = PDELabInterface()
-            from dune.perftool.ufl.visitor import UFL2LoopyVisitor
-            visitor = UFL2LoopyVisitor(interface, measure, indexmap)
-            get_backend(interface="accum_insn")(visitor, accterm, measure, subdomain_id)
+def visit_integral(integral):
+    integrand = integral.integrand()
+    measure = integral.integral_type()
+    subdomain_id = integral.subdomain_id()
+    subdomain_data = integral.subdomain_data()
+    # Get a transformer instance for this kernel
+    if get_option('sumfact'):
+        from dune.perftool.sumfact import SumFactInterface
+        interface = SumFactInterface()
+    else:
+        from dune.perftool.pdelab import PDELabInterface
+        interface = PDELabInterface()
+    from dune.perftool.ufl.visitor import UFL2LoopyVisitor
+    visitor = UFL2LoopyVisitor(interface, measure, subdomain_id)
+    # Start the visiting process!
+    visitor.accumulate(integrand)
 def generate_kernel(integrals):
@@ -429,7 +443,8 @@ def generate_kernel(integrals):
     # Visit all integrals once to collect information (dry-run)!
     logger.debug('generate_kernel: visit_integrals (dry run)')
     with global_context(dry_run=True):
-        visit_integrals(integrals)
+        for integral in integrals:
+            visit_integral(integral)
     # Now perform some checks on what should be done
     from dune.perftool.sumfact.vectorization import decide_vectorization_strategy
@@ -440,7 +455,8 @@ def generate_kernel(integrals):
     logger.debug('generate_kernel: visit_integrals (no dry run)')
     from dune.perftool.generation import delete_cache_items
-    visit_integrals(integrals)
+    for integral in integrals:
+        visit_integral(integral)
     knl = extract_kernel_from_cache("kernel_default")
diff --git a/python/dune/perftool/pdelab/spaces.py b/python/dune/perftool/pdelab/spaces.py
index 791a4ccff5370141036cc60c809a2ba05d719c3c..068672bafbd2c5196d7b4bd01dfb62c2d1e789fc 100644
--- a/python/dune/perftool/pdelab/spaces.py
+++ b/python/dune/perftool/pdelab/spaces.py
@@ -1,6 +1,7 @@
 """ Generator functions for PDELab local/grid function spaces etc. """
-from dune.perftool.generation import (domain,
+from dune.perftool.generation import (class_member,
+                                      domain,
@@ -8,34 +9,17 @@ from dune.perftool.generation import (domain,
 from dune.perftool.pdelab.restriction import restricted_name
+from dune.perftool.ufl.modified_terminals import Restriction
 from loopy import CallMangleInfo
 from loopy.symbolic import FunctionIdentifier
 from loopy.types import NumpyType
 from pymbolic.primitives import Variable
+from functools import partial
 import numpy
-class LFSChild(FunctionIdentifier):
-    def __init__(self, lfs):
-        self.lfs = lfs
-    def __getinitargs__(self):
-        return (self.lfs,)
-    @property
-    def name(self):
-        return '{}.child'.format(self.lfs)
-def lfs_child_mangler(target, func, dtypes):
-    if isinstance(func, LFSChild):
-        return CallMangleInfo(func.name, (NumpyType(str),), (NumpyType(numpy.int32),))
 def define_lfs_bound(lfs, bound):
     return 'auto {} = {}.size();'.format(bound, lfs)
@@ -57,40 +41,6 @@ def using_indices():
     return "using namespace Dune::Indices;"
-def define_lfs(name, father, child):
-    using_indices()
-    return "auto {} = child({}, _{});".format(name, father, child)
-def lfs_child(lfs, children, shape=None, symmetry=False):
-    from pymbolic.primitives import Call, Product, Sum
-    # Old pre-TensorElement implementation kept for comaptibility
-    if shape is None:
-        indices = (Variable(children[0]),)
-    else:
-        if symmetry and len(children) == 2:
-            # I do not want to think about tensors of rank > 2 right now
-            i, j = children
-            if i > j:
-                j, i = i, j
-            i = Variable(i)
-            j = Variable(j)
-            n = len(children)
-            indices = (Sum((Product((n - 1, i)), Product((.5, i, 1 - i)), j)),)
-        else:
-            # If the index is not an int we need to make a variable out of it.
-            #
-            # Note: ints occur in the sumfactorisation case with
-            # vector valued functions (eg. Stokes)
-            if not isinstance(children[0], int):
-                children = tuple(Variable(c) for c in children)
-            indices = (Sum(tuple(Product((Product(tuple(shape[j] for j in range(i))), child)) for i, child in enumerate(children))),)
-    return Call(LFSChild(lfs), indices)
 @generator_factory(cache_key_generator=lambda e, r, **kw: (e, r))
 def name_leaf_lfs(leaf_element, restriction, val=None):
     """ This function just caches leaf lfs names based on the
@@ -98,90 +48,111 @@ def name_leaf_lfs(leaf_element, restriction, val=None):
     for size information. OTOH, they are available with just the
     leaf element available (as seen in basis evaluation).
-    # This generator function should be prepoluted by the lfs tree traversal,
-    # so val should always be None when we actually want the result
     assert val
+    return val
+@generator_factory(cache_key_generator=lambda e, **kw: e)
+def type_leaf_gfs(leaf_element, val=None):
+    """ This function just caches leaf lfs names based on the
+    element. The resulting local function spaces are useful only
+    for size information. OTOH, they are available with just the
+    leaf element available (as seen in basis evaluation).
+    """
+    assert val
     return val
-@generator_factory(cache_key_generator=lambda e, r, t, **kw: (e, r, t), context_tags=("kernel",))
-def name_lfs(element, restriction, tree_path, prefix=None):
-    # Omitting the prefix is only valid upon a second call, which will
-    # result in a cache hit.
-    assert prefix
+@generator_factory(cache_key_generator=lambda e, r, **kw: (e, r))
+def available_lfs_names(element, restriction, name=None):
+    assert name
+    return name
-    def _name_lfs(prefix, tree_path):
-        name = prefix
-        if len(tree_path) > 0:
-            name = name + '_' + '_'.join(str(i) for i in tree_path)
-        return name
-    name = _name_lfs(prefix, tree_path)
-    if len(tree_path) > 0:
-        father = _name_lfs(prefix, tree_path[:-1])
-        # If this localfunction space is the child of another one, trigger
-        # the extraction preamble. Necessary before going into recursion
-        # for having the correct (top-down) order of preambles
-        define_lfs(name, father, tree_path[-1])
+@generator_factory(cache_key_generator=lambda e, r, **kw: e)
+def available_gfs_names(element, restriction, name=None):
+    assert name
+    return name
-    # Recurse into the given element to define all other local function spaces!
-    from ufl import MixedElement
-    from ufl.functionview import select_subelement
-    from ufl.classes import FixedIndex
-    subel = select_subelement(element, tree_path)
-    if isinstance(subel, MixedElement):
-        for i in range(subel.num_sub_elements()):
-            name_lfs(element, restriction, tree_path + (FixedIndex(i),), prefix=prefix)
+def define_lfs(name, father, child):
+    using_indices()
+    return "auto {} = child({}, _{});".format(name, father, child)
-    # Cache the name for the subelement
-    name_leaf_lfs(subel, restriction, val=name)
-    # Now return the prefix!
-    return name
+def define_gfs(name, father, child):
+    include_file("dune/typetree/childextraction.hh", filetag="operatorfile")
+    return 'using {} = Dune::TypeTree::Child<{},{}>;'.format(name, father, child)
-@generator_factory(cache_key_generator=lambda e, **kw: e)
-def type_gfs(element, basetype=None, index_stack=None):
-    # Omitting basetype and index_stack is only valid upon a second call,
-    # which will result in a cache hit.
-    assert basetype
-    assert index_stack is not None
+def _name_lfs(element, restriction, tp, name):
+    if len(tp) == 0:
+        name_leaf_lfs(element, restriction, val=name)
+        return name
-    # Additionally, element is expected to be a ufl finite element
-    from ufl import FiniteElementBase
-    assert isinstance(element, FiniteElementBase)
+    childname = "{}_{}".format(name, tp[0])
+    define_lfs(childname, name, tp[0])
+    return _name_lfs(element.sub_elements()[tp[0]], restriction, tp[1:], childname)
+def _type_gfs(element, restriction, tp, name):
+    if len(tp) == 0:
+        type_leaf_gfs(element, val=name)
+        return name
+    childname = "{}_{}".format(name, tp[0])
+    define_gfs(childname, name, tp[0])
+    return _type_gfs(element.sub_elements()[tp[0]], restriction, tp[1:], childname)
+def _function_space_traversal(element, restriction, index, defaultname=None, recfunc=None):
+    name = defaultname(element, restriction)
-    # Recurse into the given element to define all other local function spaces!
+    tp = ()
     from ufl import MixedElement
-    from ufl.classes import FixedIndex
     if isinstance(element, MixedElement):
-        for i, subelem in enumerate(element.sub_elements()):
-            type_gfs(subelem, basetype=basetype, index_stack=index_stack + (FixedIndex(i),))
+        assert index is not None
+        tp = element.extract_subelement_component(index)
+        tp = (tp[0],) + tp[1]
+    return recfunc(element, restriction, tp, name)
-    if len(index_stack) == 0:
-        return basetype
-    else:
-        include_file("dune/typetree/childextraction.hh", filetag="operatorfile")
-        return 'Dune::TypeTree::Child<{},{}>'.format(basetype, ','.join(str(i) for i in index_stack))
+name_lfs = partial(_function_space_traversal, defaultname=available_lfs_names, recfunc=_name_lfs)
+type_gfs = partial(_function_space_traversal, defaultname=available_gfs_names, recfunc=_type_gfs)
-def traverse_lfs_tree(arg):
-    from dune.perftool.ufl.modified_terminals import ModifiedArgument
-    assert isinstance(arg, ModifiedArgument)
-    # First we need to determine the basename as given in the signature of
-    # this kernel method!
-    lfs_basename = name_argumentspace(arg)
-    from dune.perftool.pdelab.localoperator import lop_template_gfs
-    gfs_basename = lop_template_gfs(arg)
+def initialize_function_spaces(expr, visitor):
+    restriction = visitor.restriction
+    if visitor.measure == 'exterior_facet':
+        restriction = Restriction.NEGATIVE
+    index = None
+    from ufl import MixedElement
+    if isinstance(expr.ufl_element(), MixedElement):
+        index = visitor.indices[0]
-    # Now start recursively extracting local function spaces and fill the cache with
-    # all those values. That way we can later get a correct local function space with
-    # just the ufl finite element.
-    from ufl.classes import MultiIndex
-    name_lfs(arg.argexpr.ufl_element(), arg.restriction, MultiIndex(()), prefix=lfs_basename)
-    type_gfs(arg.argexpr.ufl_element(), basetype=gfs_basename, index_stack=())
+    from ufl.classes import Argument, Coefficient
+    if isinstance(expr, Argument) and expr.number() == 0:
+        available_lfs_names(expr.ufl_element(),
+                            restriction,
+                            name=name_testfunctionspace(restriction))
+        name_lfs(expr.ufl_element(), restriction, index)
+        from dune.perftool.pdelab.localoperator import lop_template_test_gfs
+        available_gfs_names(expr.ufl_element(), 0,
+                            name=lop_template_test_gfs())
+        type_gfs(expr.ufl_element(), restriction, index)
+    else:
+        available_lfs_names(expr.ufl_element(),
+                            restriction,
+                            name=name_trialfunctionspace(restriction))
+        name_lfs(expr.ufl_element(), restriction, index)
+        from dune.perftool.pdelab.localoperator import lop_template_ansatz_gfs
+        available_gfs_names(expr.ufl_element(), 0,
+                            name=lop_template_ansatz_gfs())
+        type_gfs(expr.ufl_element(), restriction, index)
 @generator_factory(item_tags=("iname",), cache_key_generator=lambda e, r, c: (e, c), context_tags=("kernel",))
@@ -222,24 +193,6 @@ def lfs_inames(element, restriction, count=None, context=''):
     return (lfs_iname(element, restriction, count, context),)
-class LFSLocalIndex(FunctionIdentifier):
-    def __init__(self, lfs):
-        self.lfs = lfs
-    def __getinitargs__(self):
-        return (self.lfs,)
-    @property
-    def name(self):
-        return '{}.localIndex'.format(self.lfs)
-def lfs_localindex_mangler(target, func, dtypes):
-    if isinstance(func, LFSLocalIndex):
-        return CallMangleInfo(func.name, (NumpyType(numpy.int32),), (NumpyType(numpy.int32),))
 def name_testfunctionspace(restriction):
     return restricted_name("lfsv", restriction)
@@ -248,21 +201,6 @@ def name_trialfunctionspace(restriction):
     return restricted_name("lfsu", restriction)
-def name_argumentspace(ma):
-    from ufl.classes import Argument, Coefficient
-    if isinstance(ma.argexpr, Argument):
-        if ma.argexpr.number() == 0:
-            return name_testfunctionspace(ma.restriction)
-        if ma.argexpr.number() == 1:
-            return name_trialfunctionspace(ma.restriction)
-    if isinstance(ma.argexpr, Coefficient):
-        # Index 0 is reserved for trialfunction, index 1 is reserved for jacobian apply function
-        assert ma.argexpr.count() < 2
-        return name_trialfunctionspace(ma.restriction)
-    # We should never encounter an argument other than 0 or 1
-    assert False
 def type_testfunctionspace():
     return "LFSV"
diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py
index a2b5623caa23a641dfaf7d050fd612a0ce6f573a..2da267716037ac75567ff959ce9a93f75aeed816 100644
--- a/python/dune/perftool/sumfact/__init__.py
+++ b/python/dune/perftool/sumfact/__init__.py
@@ -22,6 +22,18 @@ from dune.perftool.pdelab import PDELabInterface
 class SumFactInterface(PDELabInterface):
+    def get_accumulation_info(self, expr, visitor):
+        from dune.perftool.sumfact.accumulation import get_accumulation_info
+        return get_accumulation_info(expr, visitor)
+    def list_accumulation_infos(self, expr, visitor):
+        from dune.perftool.sumfact.accumulation import list_accumulation_infos
+        return list_accumulation_infos(expr, visitor)
+    def generate_accumulation_instruction(self, expr, visitor):
+        from dune.perftool.sumfact.accumulation import generate_accumulation_instruction
+        return generate_accumulation_instruction(expr, visitor)
     def lfs_inames(self, element, restriction, number=None, context=''):
         return lfs_inames(element, restriction, number, context)
@@ -33,23 +45,23 @@ class SumFactInterface(PDELabInterface):
         self.visitor.indices = indices
         return ret
-    def pymbolic_trialfunction_gradient(self, element, restriction, tree_path):
-        ret, indices = pymbolic_coefficient_gradient(element, restriction, tree_path, name_coefficientcontainer, self.visitor.indices)
+    def pymbolic_trialfunction_gradient(self, element, restriction, index):
+        ret, indices = pymbolic_coefficient_gradient(element, restriction, index, name_coefficientcontainer, self.visitor.indices)
         self.visitor.indices = indices
         return ret
-    def pymbolic_trialfunction(self, element, restriction, tree_path):
-        ret, indices = pymbolic_coefficient(element, restriction, tree_path, name_coefficientcontainer, self.visitor.indices)
+    def pymbolic_trialfunction(self, element, restriction, index):
+        ret, indices = pymbolic_coefficient(element, restriction, index, name_coefficientcontainer, self.visitor.indices)
         self.visitor.indices = indices
         return ret
-    def pymbolic_apply_function_gradient(self, element, restriction, tree_path):
-        ret, indices = pymbolic_coefficient_gradient(element, restriction, tree_path, name_applycontainer, self.visitor.indices)
+    def pymbolic_apply_function_gradient(self, element, restriction, index):
+        ret, indices = pymbolic_coefficient_gradient(element, restriction, index, name_applycontainer, self.visitor.indices)
         self.visitor.indices = indices
         return ret
-    def pymbolic_apply_function(self, element, restriction, tree_path):
-        ret, indices = pymbolic_coefficient(element, restriction, tree_path, name_applycontainer, self.visitor.indices)
+    def pymbolic_apply_function(self, element, restriction, index):
+        ret, indices = pymbolic_coefficient(element, restriction, index, name_applycontainer, self.visitor.indices)
         self.visitor.indices = indices
         return ret
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index 86a19cd249816df2f09ac89a80ff589e0082b23d..f6ce77eb46de74d2ffd47f419448cea0b7798ba1 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -75,222 +75,277 @@ class AlreadyAssembledInput(SumfactKernelInputBase):
         return hash(self.index)
-def _component_gradient_indices(argument):
-    """Return component and gradient index of test function argument
+class SumfactAccumulationInfo(ImmutableRecord):
+    def __init__(self,
+                 element=None,
+                 element_index=0,
+                 restriction=None,
+                 inames=(),
+                 grad_index=None,
+                 ):
+        ImmutableRecord.__init__(self,
+                                 element=element,
+                                 element_index=element_index,
+                                 restriction=restriction,
+                                 inames=inames,
+                                 grad_index=grad_index,
+                                 )
+    def __eq__(self, other):
+        return type(self) == type(other) and self.element_index == other.element_index and self.restriction == other.restriction and self.grad_index == other.grad_index
+    def __hash__(self):
+        return (self.element_index, self.restriction, self.grad_index)
+def get_accumulation_info(expr, visitor):
+    element = expr.ufl_element()
+    leaf_element = element
+    element_index = 0
+    from ufl import MixedElement
+    if isinstance(expr.ufl_element(), MixedElement):
+        element_index = visitor.indices[0]
+        leaf_element = element.extract_component(element_index)[1]
+    restriction = visitor.restriction
+    if visitor.measure == 'exterior_facet':
+        from dune.perftool.pdelab.restriction import Restriction
+        restriction = Restriction.NEGATIVE
+    inames = visitor.interface.lfs_inames(leaf_element,
+                                          restriction,
+                                          expr.number()
+                                          )
-    The component_index is the component of vector valued functions
-    (eg v in Stokes). The gradient index is the direction of the
-    derivative (eg in \Delat u in poisson or \grad u in Stoks).
-    """
     grad_index = None
-    component_index = None
-    # If this is a gradient the last index is the grad_index
-    if argument.reference_grad:
-        grad_index = argument.expr.ufl_operands[1][-1]._value
-    # If this argument has indices there could be a component_index
-    if isinstance(argument.expr, uc.Indexed):
-        # More than two indices not supported
-        if len(argument.expr.ufl_operands[1]) > 2:
-            assert False
-        # For two indices the first is the component_index
-        if len(argument.expr.ufl_operands[1]) == 2:
-            assert grad_index is not None
-            component_index = argument.expr.ufl_operands[1].indices()[0]._value
-        # For there is no gradient index we should have only one index, the component_index
-        if not argument.reference_grad:
-            assert len(argument.expr.ufl_operands[1]) == 1
-            component_index = argument.expr.ufl_operands[1].indices()[0]._value
-    return component_index, grad_index
-@backend(interface="accum_insn", name="sumfact")
-def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
-    # When doing sum factorization we want to split the test function
-    assert(accterm.argument.expr is not None)
-    # Get component and gradient index
-    component_index, grad_index = _component_gradient_indices(accterm.argument)
-    # Do the tree traversal to get a pymbolic expression representing this expression
-    pymbolic_expr = visitor(accterm.term)
-    if pymbolic_expr == 0:
+    if visitor.reference_grad and expr.number() == 0:
+        if isinstance(expr.ufl_element(), MixedElement):
+            grad_index = visitor.indices[1]
+        else:
+            grad_index = visitor.indices[0]
+    return SumfactAccumulationInfo(element=expr.ufl_element(),
+                                   element_index=element_index,
+                                   restriction=restriction,
+                                   inames=inames,
+                                   grad_index=grad_index,
+                                   )
+def _test_generator(expr, visitor):
+    from dune.perftool.ufl.modified_terminals import extract_modified_arguments
+    ma = extract_modified_arguments(expr, argnumber=0)
+    if len(ma) == 0:
-    # Number of basis functions
+    element = ma[0].argexpr.ufl_element()
     dim = world_dimension()
-    mod_arg_expr = accterm.argument.expr
-    while (not isinstance(mod_arg_expr, uc.FunctionView)) and (not isinstance(mod_arg_expr, uc.Argument)):
-        mod_arg_expr = mod_arg_expr.ufl_operands[0]
-    degree = mod_arg_expr.ufl_element()._degree
-    basis_size = degree + 1
+    from dune.perftool.ufl.modified_terminals import Restriction
+    if visitor.measure == "cell":
+        restrictions = (Restriction.NONE,)
+    elif visitor.measure == "exterior_facet":
+        restrictions = (Restriction.NEGATIVE,)
+    elif visitor.measure == "interior_facet":
+        restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE)
+    for res in restrictions:
+        for ei in range(element.num_sub_elements() + 1):
+            for grad in (None,) + tuple(range(dim)):
+                yield SumfactAccumulationInfo(element_index=ei,
+                                              restriction=res,
+                                              grad_index=grad)
+def _trial_generator(expr, visitor):
+    from dune.perftool.ufl.modified_terminals import extract_modified_arguments
+    ma = extract_modified_arguments(expr, argnumber=1)
+    if len(ma) == 0:
+        yield None
+        return
+    element = ma[0].argexpr.ufl_element()
+    from dune.perftool.ufl.modified_terminals import Restriction
+    if visitor.measure == "cell":
+        restrictions = (Restriction.NONE,)
+    elif visitor.measure == "exterior_facet":
+        restrictions = (Restriction.NEGATIVE,)
+    elif visitor.measure == "interior_facet":
+        restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE)
+    for res in restrictions:
+        for ei in range(element.num_sub_elements() + 1):
+            yield SumfactAccumulationInfo(element_index=ei, restriction=res)
+def list_accumulation_infos(expr, visitor):
+    import itertools
+    return itertools.product(_test_generator(expr, visitor), _trial_generator(expr, visitor))
-    jacobian_inames = tuple()
-    if accterm.is_jacobian:
-        jacobian_inames = visitor.inames
-    # Only accumulate boundary conditions on parts where it is defined
+def generate_accumulation_instruction(expr, visitor):
+    dim = world_dimension()
+    test_info = visitor.test_info
+    trial_info = visitor.trial_info
+    leaf_element = test_info.element
+    if leaf_element.num_sub_elements() > 0:
+        leaf_element = leaf_element.extract_component(test_info.element_index)[1]
+    basis_size = leaf_element.degree() + 1
     from dune.perftool.pdelab.localoperator import boundary_predicates
-    predicates = boundary_predicates(accterm.term, measure, subdomain_id, visitor)
-    def emit_sumfact_kernel(restriction, insn_dep):
-        test_lfs = determine_accumulation_space(accterm.argument.expr, 0, measure, idims=(component_index,))
-        ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure, idims=(component_index,))
-        accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
-        # Construct the matrix sequence for this sum factorization
-        matrix_sequence = construct_basis_matrix_sequence(
-            transpose=True,
-            derivative=grad_index,
-            facedir=get_facedir(accterm.argument.restriction),
-            facemod=get_facemod(accterm.argument.restriction),
-            basis_size=basis_size)
-        # TODO: Adapt preferred position for stokes sumfact symdiff
-        sf = SumfactKernel(matrix_sequence=matrix_sequence,
-                           restriction=(accterm.argument.restriction, restriction),
-                           stage=3,
-                           preferred_position=grad_index,
-                           accumvar=accum,
-                           within_inames=jacobian_inames,
-                           input=AlreadyAssembledInput(index=(component_index,)),
-                           component_index=component_index,
-                           )
+    predicates = boundary_predicates(expr,
+                                     visitor.measure,
+                                     visitor.subdomain_id,
+                                     visitor)
-        from dune.perftool.sumfact.vectorization import attach_vectorization_info
-        vsf = attach_vectorization_info(sf)
-        # Make sure we have a buffer that we can set up the input with
-        buffer = vsf.buffer
-        if buffer is None:
-            buffer = get_counted_variable("buffer")
-        vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset()
-        temp = get_buffer_temporary(buffer,
-                                    shape=vsf.quadrature_shape,
-                                    dim_tags=vsf.quadrature_dimtags,
-                                    name="input_{}".format(buffer),
-                                    )
-        # Those input fields, that are padded need to be set to zero
-        # in order to do a horizontal_add later on
-        for pad in vsf.padded_indices:
-            assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag), pad)
-            instruction(assignee=assignee,
-                        expression=0,
-                        forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames),
-                        forced_iname_deps_is_final=True,
-                        tags=frozenset(["quadvec", "gradvec"]),
-                        )
-        # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
-        timer_dep = frozenset()
-        if get_option("instrumentation_level") >= 4:
-            timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
-            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-            dump_accumulate_timer(timer_name)
-            if(jacobian_inames):
-                timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                                   within_inames=frozenset(jacobian_inames))})
-        # Determine dependencies
-        from loopy.match import Or, Writes
-        from loopy.symbolic import DependencyMapper
-        from dune.perftool.tools import get_pymbolic_basename
-        deps = Or(tuple(Writes(get_pymbolic_basename(expr)) for expr in DependencyMapper()(pymbolic_expr)))
-        # Issue an instruction in the quadrature loop that fills the buffer
-        # with the evaluation of the contribution at all quadrature points
-        assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag),
-                                  vsf.quadrature_index(sf))
-        contrib_dep = instruction(assignee=assignee,
-                                  expression=pymbolic_expr,
-                                  forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames),
-                                  forced_iname_deps_is_final=True,
-                                  tags=frozenset({"quadvec"}).union(vectag),
-                                  depends_on=frozenset({deps}).union(timer_dep).union(frozenset({lp.match.Tagged("sumfact_stage1")})),
-                                  )
-        if insn_dep is None:
-            insn_dep = frozenset({contrib_dep})
-        if get_option("instrumentation_level") >= 4:
-            insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                              depends_on=insn_dep,
-                                              within_inames=frozenset(jacobian_inames))})
-        inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i)
-                       for i, mat in enumerate(vsf.matrix_sequence))
-        # Collect the lfs and lfs indices for the accumulate call
-        test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames),
-                                       (basis_size,) * dim,
-                                       order="f"
-                                       )
-        # In the jacobian case, also determine the space for the ansatz space
-        if accterm.is_jacobian:
-            # TODO the next line should get its inames from
-            # elsewhere. This is *NOT* robust (but works right now)
-            ansatz_lfs.index = flatten_index(tuple(prim.Variable(jacobian_inames[i])
-                                                   for i in range(world_dimension())),
-                                             (basis_functions_per_direction(),) * dim,
-                                             order="f"
-                                             )
-        # Add a sum factorization kernel that implements the multiplication
-        # with the test function (stage 3)
-        from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
-        result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep)))
-        # Determine the expression to accumulate with. This depends on the vectorization strategy!
-        result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
-        vecinames = ()
-        if vsf.vectorized:
-            iname = accum_iname((accterm.argument.restriction, restriction), vsf.vector_width, "vec")
-            vecinames = (iname,)
-            transform(lp.tag_inames, [(iname, "vec")])
-            from dune.perftool.tools import maybe_wrap_subscript
-            result = prim.Call(prim.Variable("horizontal_add"),
-                               (maybe_wrap_subscript(result, prim.Variable(iname)),),
-                               )
-        if not get_option("fastdg"):
-            rank = 2 if accterm.is_jacobian else 1
-            expr = prim.Call(PDELabAccumulationFunction(accum, rank),
-                             (test_lfs.get_args() +
-                              ansatz_lfs.get_args() +
-                              (result,)
+    insn_dep = None
+    from dune.perftool.pdelab.localoperator import determine_accumulation_space
+    test_lfs = determine_accumulation_space(test_info, 0)
+    ansatz_lfs = determine_accumulation_space(trial_info, 1)
+    if trial_info is None:
+        trial_info = SumfactAccumulationInfo()
+    from dune.perftool.pdelab.argument import name_accumulation_variable
+    accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
+    matrix_sequence = construct_basis_matrix_sequence(
+        transpose=True,
+        derivative=test_info.grad_index,
+        facedir=get_facedir(test_info.restriction),
+        facemod=get_facemod(test_info.restriction),
+        basis_size=basis_size)
+    jacobian_inames = trial_info.inames
+    priority = test_info.grad_index
+    if priority is None:
+        priority = 3
+    sf = SumfactKernel(matrix_sequence=matrix_sequence,
+                       restriction=(test_info.restriction, trial_info.restriction),
+                       stage=3,
+                       position_priority=priority,
+                       accumvar=accumvar,
+                       within_inames=jacobian_inames,
+                       input=AlreadyAssembledInput(index=(test_info.element_index,)),
+                       component_index=test_info.element_index,
+                       )
+    from dune.perftool.sumfact.vectorization import attach_vectorization_info
+    vsf = attach_vectorization_info(sf)
+    # Make sure we have a buffer that we can set up the input with
+    buffer = vsf.buffer
+    if buffer is None:
+        buffer = get_counted_variable("buffer")
+    vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset()
+    temp = get_buffer_temporary(buffer,
+                                shape=vsf.quadrature_shape,
+                                dim_tags=vsf.quadrature_dimtags,
+                                name="input_{}".format(buffer),
+                                )
+    # Those input fields, that are padded need to be set to zero
+    # in order to do a horizontal_add later on
+    for pad in vsf.padded_indices:
+        assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag), pad)
+        instruction(assignee=assignee,
+                    expression=0,
+                    forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames),
+                    forced_iname_deps_is_final=True,
+                    tags=frozenset(["quadvec", "gradvec"]),
+                    )
+    # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
+    timer_dep = frozenset()
+    if get_option("instrumentation_level") >= 4:
+        timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+        dump_accumulate_timer(timer_name)
+        if(jacobian_inames):
+            timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                               within_inames=frozenset(jacobian_inames))})
+    # Determine dependencies
+    from loopy.match import Or, Writes
+    from loopy.symbolic import DependencyMapper
+    from dune.perftool.tools import get_pymbolic_basename
+    deps = Or(tuple(Writes(get_pymbolic_basename(e)) for e in DependencyMapper()(expr)))
+    # Issue an instruction in the quadrature loop that fills the buffer
+    # with the evaluation of the contribution at all quadrature points
+    assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag),
+                              vsf.quadrature_index(sf))
+    contrib_dep = instruction(assignee=assignee,
+                              expression=expr,
+                              forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames),
+                              forced_iname_deps_is_final=True,
+                              tags=frozenset({"quadvec"}).union(vectag),
+                              depends_on=frozenset({deps}).union(timer_dep).union(frozenset({lp.match.Tagged("sumfact_stage1")})),
-                             )
-            instruction(assignees=(),
-                        expression=expr,
-                        forced_iname_deps=frozenset(inames + vecinames + jacobian_inames),
-                        forced_iname_deps_is_final=True,
-                        depends_on=insn_dep,
-                        predicates=predicates
-                        )
-        # Mark the transformation that moves the quadrature loop
-        # inside the trialfunction loops for application
-        if accterm.is_jacobian:
-            transform(nest_quadrature_loops, jacobian_inames)
-        return insn_dep
-    # Extract the restrictions on argument-1:
-    jac_restrictions = frozenset(tuple(ma.restriction for ma in
-                                       extract_modified_arguments(accterm.term,
-                                                                  argnumber=1,
-                                                                  do_index=True)))
-    if not jac_restrictions:
-        jac_restrictions = frozenset({0})
-    insn_dep = None
-    for restriction in jac_restrictions:
-        insn_dep = emit_sumfact_kernel(restriction, insn_dep)
+    if insn_dep is None:
+        insn_dep = frozenset({contrib_dep})
+    if get_option("instrumentation_level") >= 4:
+        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                          depends_on=insn_dep,
+                                          within_inames=frozenset(jacobian_inames))})
+    inames = tuple(accum_iname((test_info.restriction, trial_info.restriction), mat.rows, i)
+                   for i, mat in enumerate(vsf.matrix_sequence))
+    # Collect the lfs and lfs indices for the accumulate call
+    test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames),
+                                   (basis_size,) * dim,
+                                   order="f"
+                                   )
+    # In the jacobian case, also determine the space for the ansatz space
+    if jacobian_inames:
+        # TODO the next line should get its inames from
+        # elsewhere. This is *NOT* robust (but works right now)
+        ansatz_lfs.index = flatten_index(tuple(prim.Variable(jacobian_inames[i])
+                                               for i in range(world_dimension())),
+                                         (basis_functions_per_direction(),) * dim,
+                                         order="f"
+                                         )
+    # Add a sum factorization kernel that implements the multiplication
+    # with the test function (stage 3)
+    from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
+    result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep)))
+    # Determine the expression to accumulate with. This depends on the vectorization strategy!
+    result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
+    vecinames = ()
+    if vsf.vectorized:
+        iname = accum_iname((test_info.restriction, trial_info.restriction), vsf.vector_width, "vec")
+        vecinames = (iname,)
+        transform(lp.tag_inames, [(iname, "vec")])
+        from dune.perftool.tools import maybe_wrap_subscript
+        result = prim.Call(prim.Variable("horizontal_add"),
+                           (maybe_wrap_subscript(result, prim.Variable(iname)),),
+                           )
+    if not get_option("fastdg"):
+        rank = 2 if jacobian_inames else 1
+        expr = prim.Call(PDELabAccumulationFunction(accumvar, rank),
+                         (test_lfs.get_args() +
+                          ansatz_lfs.get_args() +
+                          (result,)
+                          )
+                         )
+        instruction(assignees=(),
+                    expression=expr,
+                    forced_iname_deps=frozenset(inames + vecinames + jacobian_inames),
+                    forced_iname_deps_is_final=True,
+                    depends_on=insn_dep,
+                    predicates=predicates
+                    )
+    # Mark the transformation that moves the quadrature loop
+    # inside the trialfunction loops for application
+    if jacobian_inames:
+        transform(nest_quadrature_loops, jacobian_inames)
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 7a2a0e2f31902282ddfd6b701a7b456c846aae35..9ea50e08c57b52a15843fc8bf3979547ed4d0389 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -37,12 +37,11 @@ from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase
 from dune.perftool.options import get_option
 from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.restriction import restricted_name
-from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, lfs_child, name_leaf_lfs
+from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, name_leaf_lfs
 from dune.perftool.tools import maybe_wrap_subscript
 from dune.perftool.pdelab.basis import shape_as_pymbolic
 from dune.perftool.sumfact.accumulation import sumfact_iname
-from ufl.functionview import select_subelement
 from ufl import VectorElement, TensorElement
 from pytools import product, ImmutableRecord
@@ -55,46 +54,23 @@ import pymbolic.primitives as prim
 class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
     def __init__(self,
-                 component_index=None,
-                 tree_path=None,
+                 element_index=0,
-                                 component_index=component_index,
-                                 tree_path=tree_path,
+                                 element_index=element_index,
     def realize(self, sf, index, insn_dep):
-        lfs = name_lfs(self.element, self.restriction, self.tree_path)
-        sub_element = select_subelement(self.element, self.tree_path)
-        shape = sub_element.value_shape() + (self.element.cell().geometric_dimension(),)
-        if isinstance(sub_element, (VectorElement, TensorElement)):
-            # Could be 0 but shouldn't be None
-            assert self.component_index is not None
-            lfs_pym = lfs_child(lfs,
-                                (self.component_index,),
-                                shape=shape_as_pymbolic(shape[:-1]),
-                                symmetry=self.element.symmetry())
-        leaf_element = sub_element
-        if isinstance(sub_element, (VectorElement, TensorElement)):
-            leaf_element = sub_element.sub_elements()[0]
-        lfs = name_leaf_lfs(leaf_element, self.restriction)
+        lfs = name_lfs(self.element, self.restriction, self.element_index)
         basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
         container = self.coeff_func(self.restriction)
-        if isinstance(sub_element, (VectorElement, TensorElement)):
-            from dune.perftool.pdelab.argument import pymbolic_coefficient as pc
-            coeff = pc(container, lfs_pym, basisiname)
-        else:
-            from dune.perftool.pdelab.argument import pymbolic_coefficient as pc
-            coeff = pc(container, lfs, basisiname)
+        from dune.perftool.pdelab.argument import pymbolic_coefficient as pc
+        coeff = pc(container, lfs, basisiname)
         # Get the input temporary!
         name = get_buffer_temporary(sf.buffer,
@@ -118,39 +94,25 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
             return None
-def _basis_functions_per_direction(element, tree_path):
-    """Number of basis functions per direction of a given tree_path of an element"""
-    assert len(tree_path.indices()) <= 1
-    if len(tree_path.indices()) == 0:
-        degree = element.degree()
-    else:
-        index = tree_path.indices()[0]._value
-        degree = element.sub_elements()[index].degree()
-    basis_size = degree + 1
-    return basis_size
+def _basis_functions_per_direction(element):
+    """Number of basis functions per direction """
+    from ufl import FiniteElement
+    assert isinstance(element, FiniteElement)
+    return element.degree() + 1
-def pymbolic_coefficient_gradient(element, restriction, tree_path, coeff_func, visitor_indices):
-    rawname = "gradu" + "_".join(str(c) for c in tree_path)
-    name = restricted_name(rawname, restriction)
+def pymbolic_coefficient_gradient(element, restriction, index, coeff_func, visitor_indices):
+    sub_element = element
+    grad_index = visitor_indices[0]
+    if element.num_sub_elements() > 0:
+        sub_element = element.extract_component(index)[1]
+    from ufl import FiniteElement
+    assert isinstance(sub_element, FiniteElement)
     # Number of basis functions per direction
-    basis_size = _basis_functions_per_direction(element, tree_path)
-    # Get a temporary for the gradient
-    from ufl.functionview import select_subelement
-    sub_element = select_subelement(element, tree_path)
-    rank = len(sub_element.value_shape()) + 1
-    shape = sub_element.value_shape() + (world_dimension(),)
-    shape_impl = ('arr',) * rank
-    temporary_variable(name, shape=shape, shape_impl=shape_impl)
-    if len(visitor_indices) == 1:
-        component_index = None
-        grad_index, = visitor_indices
-    else:
-        component_index, grad_index = visitor_indices
+    basis_size = _basis_functions_per_direction(sub_element)
     # Construct the matrix sequence for this sum factorization
     matrix_sequence = construct_basis_matrix_sequence(derivative=grad_index,
@@ -160,15 +122,14 @@ def pymbolic_coefficient_gradient(element, restriction, tree_path, coeff_func, v
     inp = LFSSumfactKernelInput(coeff_func=coeff_func,
-                                component_index=component_index,
-                                tree_path=tree_path,
+                                element_index=index,
     # The sum factorization kernel object gathering all relevant information
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
-                       preferred_position=grad_index,
+                       position_priority=grad_index,
@@ -182,29 +143,30 @@ def pymbolic_coefficient_gradient(element, restriction, tree_path, coeff_func, v
-def pymbolic_coefficient(element, restriction, tree_path, coeff_func, visitor_indices):
+def pymbolic_coefficient(element, restriction, index, coeff_func, visitor_indices):
+    sub_element = element
+    if element.num_sub_elements() > 0:
+        sub_element = element.extract_component(index)[1]
+    from ufl import FiniteElement
+    assert isinstance(sub_element, FiniteElement)
     # Basis functions per direction
-    basis_size = _basis_functions_per_direction(element, tree_path)
+    basis_size = _basis_functions_per_direction(sub_element)
     # Construct the matrix sequence for this sum factorization
     matrix_sequence = construct_basis_matrix_sequence(facedir=get_facedir(restriction),
-    component_index = None
-    if visitor_indices:
-        assert len(visitor_indices) == 1
-        component_index = visitor_indices[0]
     inp = LFSSumfactKernelInput(coeff_func=coeff_func,
-                                component_index=component_index,
-                                tree_path=tree_path,
+                                element_index=index,
     sf = SumfactKernel(matrix_sequence=matrix_sequence,
+                       position_priority=3,
     from dune.perftool.sumfact.vectorization import attach_vectorization_info
@@ -227,9 +189,11 @@ def sumfact_lfs_iname(bound, dim):
 @backend(interface="lfs_inames", name="sumfact")
 def lfs_inames(element, restriction, number=1, context=''):
-    assert number == 1
-    dim = world_dimension()
-    return tuple(sumfact_lfs_iname(basis_functions_per_direction(), d) for d in range(dim))
+    if number == 0:
+        return ()
+    else:
+        dim = world_dimension()
+        return tuple(sumfact_lfs_iname(basis_functions_per_direction(), d) for d in range(dim))
 @backend(interface="evaluate_basis", name="sumfact")
@@ -264,7 +228,10 @@ def evaluate_basis(element, name, restriction):
 def pymbolic_basis(element, restriction, number):
-    assert number == 1
+    # If this is a test function we omit it!
+    if number == 0:
+        return 1
     assert element.num_sub_elements() == 0
     name = "phi_{}".format(FEM_name_mangling(element))
@@ -313,7 +280,10 @@ def evaluate_reference_gradient(element, name, restriction, index):
 def pymbolic_reference_gradient(element, restriction, number, indices):
-    assert number == 1
+    # If this is a test function, we omit it.
+    if number == 0:
+        return 1, None
     assert len(indices) == 1
     index, = indices
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 878e047c613da83ce01c94376a49f5b8b1553191..aeb2a2260c8fd914f19414ab87686e147b029924 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -32,7 +32,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
-                 preferred_position=None,
+                 position_priority=None,
@@ -92,7 +92,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             pre-initialized with the input or you have to provide
             direct_input (FastDGGridOperator).
         stage: 1 or 3
-        preferred_position: Will be used in the dry run to order kernels
+        position_priority: Will be used in the dry run to order kernels
             when doing vectorization e.g. (dx u,dy u,dz u, u).
         restriction: Restriction for faces values.
         within_inames: Instructions will be executed within those
@@ -112,9 +112,6 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         assert stage in (1, 3)
-        if preferred_position is not None:
-            assert isinstance(preferred_position, int)
         if stage == 1:
             assert isinstance(input, SumfactKernelInputBase)
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 7df5f236a26855ecd0cbd9207ba210f7a563dc65..790fa59a1f052078800785cf10ed35fbf988da03 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -90,38 +90,22 @@ def vertical_vectorization_strategy(sumfact, depth):
 def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1):
     result = {}
     todo = set(sumfacts)
     while todo:
-        position_mapping = {}
-        available = set(range(width))
-        for sf in todo:
-            if sf.preferred_position is not None and sf.preferred_position in available:
-                available.discard(sf.preferred_position)
-                position_mapping[sf.preferred_position] = sf
-        for sf in position_mapping.values():
-            todo.discard(sf)
-        for pos in available:
-            if todo:
-                position_mapping[pos] = todo.pop()
-        kernels = [None] * len(position_mapping)
-        for pos in position_mapping:
-            kernels[pos] = position_mapping[pos]
-        kernels = tuple(kernels)
+        kernels = []
+        for sf in sorted(todo, key=lambda s: s.position_priority):
+            if len(kernels) < width:
+                kernels.append(sf)
+                todo.discard(sf)
         buffer = get_counted_variable("joined_buffer")
+        if len(kernels) in range(width - allow_padding, width + 1):
+            for sf in kernels:
+                result[sf] = VectorizedSumfactKernel(kernels=tuple(kernels),
+                                                     horizontal_width=width,
+                                                     buffer=buffer,
+                                                     )
-        for sumf in kernels:
-            if len(kernels) in range(width - allow_padding, width + 1):
-                result[sumf] = VectorizedSumfactKernel(kernels=kernels,
-                                                       horizontal_width=width,
-                                                       buffer=buffer,
-                                                       )
     return result
diff --git a/python/dune/perftool/ufl/execution.py b/python/dune/perftool/ufl/execution.py
index 3c3583391f46c1f03a98d12dc839cd377675e5bb..1a0ac249cf4bcb1ebd13312c01a2d5249c515618 100644
--- a/python/dune/perftool/ufl/execution.py
+++ b/python/dune/perftool/ufl/execution.py
@@ -30,7 +30,7 @@ class Coefficient(ufl.Coefficient):
 def split(obj):
-    return ufl.split_functions.split2(obj)
+    return ufl.split_functions.split(obj)
 def Coefficients(element):
diff --git a/python/dune/perftool/ufl/modified_terminals.py b/python/dune/perftool/ufl/modified_terminals.py
index da555529faddd7d3157585afeb5cd73706246c4c..cc818a7b563224c21909e7f86eba7f9420ef50d3 100644
--- a/python/dune/perftool/ufl/modified_terminals.py
+++ b/python/dune/perftool/ufl/modified_terminals.py
@@ -80,11 +80,11 @@ class ModifiedTerminalTracker(MultiFunction):
         self.reference_grad = False
         return ret
-    def function_view(self, o):
-        self.tree_path = o.ufl_operands[1]
-        ret = self.call(o.ufl_operands[0])
-        self.tree_path = MultiIndex(())
-        return ret
+#     def function_view(self, o):
+#         self.tree_path = o.ufl_operands[1]
+#         ret = self.call(o.ufl_operands[0])
+#         self.tree_path = MultiIndex(())
+#         return ret
     def reference_value(self, o):
         self.reference = True
@@ -176,7 +176,7 @@ class _ModifiedArgumentExtractor(MultiFunction):
     positive_restricted = pass_on
     negative_restricted = pass_on
-    function_view = pass_on
+#     function_view = pass_on
     reference_value = pass_on
     def argument(self, o):
diff --git a/python/dune/perftool/ufl/preprocess.py b/python/dune/perftool/ufl/preprocess.py
index fba501dea4918827af809d89d6fe7782f846cc7b..df756bf742628e46d76cb69de431a383463219c0 100644
--- a/python/dune/perftool/ufl/preprocess.py
+++ b/python/dune/perftool/ufl/preprocess.py
@@ -30,8 +30,8 @@ def apply_default_transformations(form):
     from dune.perftool.ufl.transformations.reindexing import reindexing
     from dune.perftool.ufl.transformations.unroll import unroll_dimension_loops
-    form = transform_form(form, unroll_dimension_loops)
+#     form = transform_form(form, unroll_dimension_loops)
     form = transform_form(form, pushdown_indexed)
-    form = transform_form(form, reindexing)
+#     form = transform_form(form, reindexing)
     return form
diff --git a/python/dune/perftool/ufl/transformations/unroll.py b/python/dune/perftool/ufl/transformations/unroll.py
index 01088008cd4634f62184b79a42175cc66b2085c3..1a43849a63f194138551d01f471351e9e03cd98e 100644
--- a/python/dune/perftool/ufl/transformations/unroll.py
+++ b/python/dune/perftool/ufl/transformations/unroll.py
@@ -16,6 +16,8 @@ import ufl.classes as uc
 class UnrollDimensionLoops(MultiFunction):
     def __init__(self):
         self.replace = {}
+        self._indices_backup = []
+        self.indices = None
     call = MultiFunction.__call__
@@ -23,24 +25,60 @@ class UnrollDimensionLoops(MultiFunction):
     def expr(self, o):
         return self.reuse_if_untouched(o, *tuple(self(op) for op in o.ufl_operands))
+    def indexed(self, o):
+        self._indices_backup.append(self.indices)
+        self.indices = self.call(o.ufl_operands[1])
+        ret = self.call(o.ufl_operands[0])
+        if self.indices is not None:
+            ret = uc.Indexed(ret, self.indices)
+        self.indices = self._indices_backup.pop()
+        return ret
     def multi_index(self, o):
         return uc.MultiIndex(tuple(self.replace.get(i, i) for i in o))
     def index_sum(self, o):
         operands = []
-        # TODO: What is the correct way to get the shape here?
-        for i in range(o.geometric_dimension()):
+        for i in range(o.dimension()):
             self.replace[o.ufl_operands[1][0]] = uc.FixedIndex(i)
             del self.replace[o.ufl_operands[1][0]]
         return construct_binary_operator(tuple(operands), uc.Sum)
+    def list_tensor(self, o):
+        index = self.indices[0]
+        index = index._value
+        self.indices = self.indices[1:]
+        if len(self.indices) == 0:
+            self.indices = None
+        return self.call(o.ufl_operands[index])
+    def component_tensor(self, o):
+        assert len(self.indices) == len(o.ufl_operands[1])
+        # Update the index mapping
+        for i, ind in enumerate(o.ufl_operands[1]):
+            self.replace[ind] = self.indices[i]
+        self.indices = None
+        ret = self.call(o.ufl_operands[0])
+        for i, ind in enumerate(o.ufl_operands[1]):
+            del self.replace[ind]
+        return ret
 def unroll_dimension_loops(expr):
-    if get_option("unroll_dimension_loops"):
-        return UnrollDimensionLoops()(expr)
-    else:
-        return expr
+    return UnrollDimensionLoops()(expr)
+#     if get_option("unroll_dimension_loops"):
+#         return UnrollDimensionLoops()(expr)
+#     else:
+#         return expr
diff --git a/python/dune/perftool/ufl/visitor.py b/python/dune/perftool/ufl/visitor.py
index 3692d7548bac09df0f998bbd01f0d4684ee0c0a9..417b8678d471564a3efb4bf1277e6fb30dc8b6a0 100644
--- a/python/dune/perftool/ufl/visitor.py
+++ b/python/dune/perftool/ufl/visitor.py
@@ -9,6 +9,7 @@ from dune.perftool.ufl.modified_terminals import (ModifiedTerminalTracker,
 from dune.perftool.tools import maybe_wrap_subscript
+from dune.perftool.options import get_option
 from loopy import Reduction
 from pymbolic.primitives import (Call,
@@ -21,7 +22,6 @@ from pymbolic.primitives import (Call,
 from ufl.algorithms import MultiFunction
 from ufl.checks import is_cellwise_constant
-from ufl.functionview import select_subelement
 from ufl import (VectorElement,
@@ -34,21 +34,36 @@ import pymbolic.primitives as prim
 class UFL2LoopyVisitor(ModifiedTerminalTracker):
-    def __init__(self, interface, measure, component_indices):
+    def __init__(self, interface, measure, subdomain_id):
         self.interface = interface
         self.interface.visitor = self
         self.measure = measure
-        self.component_indices = component_indices
+        self.subdomain_id = subdomain_id
         # Call base class constructors
         super(UFL2LoopyVisitor, self).__init__()
     def __call__(self, o, do_predicates=False):
-        # Reset some state variables that are reinitialized for each accumulation term
+        self.current_info = None
+        return self._call(o, do_predicates)
+    def accumulate(self, o):
+        for info in self.interface.list_accumulation_infos(o, self):
+            self.current_info = info
+            expr = self._call(o, False)
+            if expr != 0:
+                self.interface.generate_accumulation_instruction(expr, self)
+    def _call(self, o, do_predicates):
+        # Reset state variables
+        self.indexmap = {}
         self.indices = None
         self._indices_backup = []
+        self.test_info = None
+        self.trial_info = None
         self.inames = ()
         self.do_predicates = do_predicates
         return self.call(o)
     call = MultiFunction.__call__
@@ -59,39 +74,42 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     def argument(self, o):
+        self.interface.initialize_function_spaces(o, self)
+        # Update the information on where to accumulate this
+        info = self.interface.get_accumulation_info(o, self)
+        if o.number() == 0:
+            if info != self.current_info[0]:
+                self.indices = None
+                return 0
+            else:
+                self.test_info = info
+        elif o.number() == 1:
+            if info != self.current_info[1]:
+                self.indices = None
+                return 0
+            else:
+                self.trial_info = info
         # Correct the restriction on boundary integrals
         restriction = self.restriction
         if self.measure == 'exterior_facet':
             restriction = Restriction.NEGATIVE
+        leaf_element = o.ufl_element()
-        # Select the correct subtree of the finite element
-        element = select_subelement(o.ufl_element(), self.tree_path)
-        leaf_element = element
-        # Now treat the case of this being a vector finite element
-        if element.num_sub_elements() > 0:
-            # I cannot handle general mixed elements here...
-            assert isinstance(element, (VectorElement, TensorElement))
-            # If this is a vector element, we need add an additional accumulation loop iname
-            shape = len(element.value_shape())
-            self.indices = self.indices[shape:]
-            for i in range(len(element.value_shape())):
-                if self.interface.component_iname(context='arg', count=i) not in self.inames:
-                    self.inames = self.inames + (self.interface.component_iname(context='arg', count=i),)
+        # Select the correct leaf element in the case of this being a mixed finite element
+        if o.ufl_element().num_sub_elements() > 0:
+            index = self.indices[0]
+            assert isinstance(index, int)
+            self.indices = self.indices[1:]
+            if len(self.indices) == 0:
+                self.indices = None
             # For the purpose of basis evaluation, we need to take the leaf element
-            leaf_element = element.sub_elements()[0]
+            leaf_element = leaf_element.extract_component(index)[1]
         if self.grad:
             raise PerftoolUFLError("Gradients should have been transformed to reference gradients!!!")
-        # Have the issued instruction depend on the iname for this localfunction space
-        inames = self.interface.lfs_inames(leaf_element, restriction, o.number())
-        for iname in inames:
-            if iname not in self.inames:
-                self.inames = self.inames + (iname,)
         if self.reference_grad:
             return self.interface.pymbolic_reference_gradient(leaf_element, restriction, o.number())
@@ -105,19 +123,29 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             if self.measure == 'exterior_facet':
                 restriction = Restriction.NEGATIVE
+            self.interface.initialize_function_spaces(o, self)
+            index = None
+            if o.ufl_element().num_sub_elements() > 0:
+                index = self.indices[0]
+                assert isinstance(index, int)
+                self.indices = self.indices[1:]
+                if len(self.indices) == 0:
+                    self.indices = None
             if self.grad:
                 raise PerftoolUFLError("Gradients should have been transformed to reference gradients!!!")
             if self.reference_grad:
                 if o.count() == 0:
-                    return self.interface.pymbolic_trialfunction_gradient(o.ufl_element(), restriction, self.tree_path)
+                    return self.interface.pymbolic_trialfunction_gradient(o.ufl_element(), restriction, index)
-                    return self.interface.pymbolic_apply_function_gradient(o.ufl_element(), restriction, self.tree_path)
+                    return self.interface.pymbolic_apply_function_gradient(o.ufl_element(), restriction, index)
                 if o.count() == 0:
-                    return self.interface.pymbolic_trialfunction(o.ufl_element(), restriction, self.tree_path)
+                    return self.interface.pymbolic_trialfunction(o.ufl_element(), restriction, index)
-                    return self.interface.pymbolic_apply_function(o.ufl_element(), restriction, self.tree_path)
+                    return self.interface.pymbolic_apply_function(o.ufl_element(), restriction, index)
         # Check if this is a parameter function
@@ -163,44 +191,30 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             self.indices = self._indices_backup.pop()
             return maybe_wrap_subscript(aggr, indices)
-    def index_sum(self, o, additional_inames=()):
-        # There is three scenarios here:
-        # * This is a normal IndexSum:
-        #   We should collect the reduction inames, go into recursion and issue
-        #   a reduction operation to a temporary
-        # * This is part of a nested indexsum:
-        #   We only collect the reduction iname and go into recursion
-        # * This IndexSum is implicitly handled by the accumulation process
-        #   We only go into recursion
-        # Get the iname for the reduction index
-        ind = o.ufl_operands[1][0]
-        redinames = additional_inames + (ind,)
-        shape = o.ufl_operands[0].ufl_index_dimensions[0]
-        domain(self.interface.name_index(ind), shape)
-        # If the left operand is an index sum to, we do it in one reduction
-        if isinstance(o.ufl_operands[0], IndexSum):
-            return self.index_sum(o.ufl_operands[0], additional_inames=redinames)
-        else:
-            # Recurse to get the summation expression
-            term = self.call(o.ufl_operands[0])
-            redinames = tuple(i for i in redinames if i not in self.component_indices)
-            if len(redinames) > 0:
-                ret = Reduction("sum", tuple(self.interface.name_index(ind) for ind in redinames), term)
-            else:
-                ret = term
+    def index_sum(self, o):
+        # This implementation fully unrolls the given indexed sum.
+        # This is done for a variety of reasons:
+        # * It eases handling of the given loopy kernel in terms of schedulability
+        # * The compiler would unroll these anyway
+        # * It allows handling of arbitrarily bad nesting of ComponentTensor and
+        #   ListTensor, which otherwise becomes a *nightmare*.
+        index = o.ufl_operands[1][0]
+        operands = []
+        for i in range(o.dimension()):
+            self.indexmap[index] = i
+            operands.append(self.call(o.ufl_operands[0]))
+            del self.indexmap[index]
-            return ret
+        from pymbolic import flattened_sum
+        return flattened_sum(tuple(operands))
     def _index_or_fixed_index(self, index):
         if isinstance(index, FixedIndex):
             return index._value
-            if index in self.component_indices:
-                if self.component_indices[index] not in self.inames:
-                    self.inames = self.inames + (self.component_indices[index],)
-                return Variable(self.component_indices[index])
+            if index in self.indexmap:
+                return self.indexmap[index]
                 return Variable(self.interface.name_index(index))
@@ -218,6 +232,20 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             return self.interface.pymbolic_list_tensor(o)
+    def component_tensor(self, o):
+        assert len(self.indices) == len(o.ufl_operands[1])
+        # Update the index mapping
+        for i, ind in enumerate(o.ufl_operands[1]):
+            self.indexmap[ind] = self.indexmap.get(self.indices[i], self.indices[i])
+        self.indices = None
+        ret = self.call(o.ufl_operands[0])
+        for i, ind in enumerate(o.ufl_operands[1]):
+            del self.indexmap[ind]
+        return ret
     def identity(self, o):
         return self.interface.pymbolic_identity(o)
@@ -227,7 +255,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     def product(self, o):
-        return Product(tuple(self.call(op) for op in o.ufl_operands))
+        from pymbolic import flattened_product
+        return flattened_product(tuple(self.call(op) for op in o.ufl_operands))
     def float_value(self, o):
         return o.value()
@@ -239,7 +268,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         return Quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1]))
     def sum(self, o):
-        return Sum(tuple(self.call(op) for op in o.ufl_operands))
+        from pymbolic import flattened_sum
+        return flattened_sum(tuple(self.call(op) for op in o.ufl_operands))
     def zero(self, o):
         return 0
@@ -347,8 +377,9 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         assert self.restriction is not Restriction.NONE
         # Optimize facet normal on axiparallel grids
+        # TODO move this into the sumfact backend, it is only valid there
         from dune.perftool.options import get_option
-        if get_option("diagonal_transformation_matrix"):
+        if get_option("diagonal_transformation_matrix") and get_option("sumfact"):
             index, = self.indices
             from dune.perftool.sumfact.switch import get_facedir
             if isinstance(index, int) and index != get_facedir(self.restriction):
@@ -380,8 +411,9 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         self.indices = None
         # Implement diagonal jacobians for unrolled matrices!
-        if isinstance(i, int) and isinstance(j, int) and i != j:
-            return 0
+        if get_option("diagonal_transformation_matrix"):
+            if isinstance(i, int) and isinstance(j, int) and i != j:
+                return 0
         return self.interface.pymbolic_jacobian_inverse_transposed(i, j, restriction)
@@ -413,3 +445,7 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     def __hash__(self):
         return 0
+    def copy(self):
+        """ Get a copy of this visitor """
+        return type(self)(self.interface, self.measure, self.subdomain_id)
diff --git a/python/ufl b/python/ufl
index 8b7062528ff99e99c7e928e7d08f0c09c8776978..962d56f65821fb9c50ca4a5a858882c472243431 160000
--- a/python/ufl
+++ b/python/ufl
@@ -1 +1 @@
-Subproject commit 8b7062528ff99e99c7e928e7d08f0c09c8776978
+Subproject commit 962d56f65821fb9c50ca4a5a858882c472243431
diff --git a/test/stokes/CMakeLists.txt b/test/stokes/CMakeLists.txt
index dc0e4d36d6d3fb94f6fbaf0643c428850c36e4ec..2a274367c3d7dc652ccfadb580b68571b4d97e96 100644
--- a/test/stokes/CMakeLists.txt
+++ b/test/stokes/CMakeLists.txt
@@ -27,11 +27,11 @@ dune_add_formcompiler_system_test(UFLFILE stokes_dg_quadrilateral.ufl
                                   INIFILE stokes_dg_quadrilateral.mini
-dune_add_formcompiler_system_test(UFLFILE stokes_stress.ufl
-                                  BASENAME stokes_stress
-                                  INIFILE stokes_stress.mini
-                                  )
+#dune_add_formcompiler_system_test(UFLFILE stokes_stress.ufl
+#                                  BASENAME stokes_stress
+#                                  INIFILE stokes_stress.mini
+#                                  )
 # Do not test stokes_stress_sym until the function_view project
 # has been fully implemented.