diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh
index 0ac07e6614e86c9a439584a71d6f17b0a1a8a90d..e7a0b5f373139932981fa0e491924b685b6076ff 100755
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -2,7 +2,6 @@
 
 pushd python/loopy
 git apply ../../patches/loopy/Current.patch
-git apply ../../patches/loopy/0001-Allow-using-globalargs-as-base-storage.patch
 popd
 
 pushd dune/perftool/vectorclass
diff --git a/patches/loopy/0001-Allow-using-globalargs-as-base-storage.patch b/patches/loopy/0001-Allow-using-globalargs-as-base-storage.patch
deleted file mode 100644
index 5df322c990864c60614eb1cd3b3bdb928dd76acc..0000000000000000000000000000000000000000
--- a/patches/loopy/0001-Allow-using-globalargs-as-base-storage.patch
+++ /dev/null
@@ -1,63 +0,0 @@
-From b15ad57a11b774bde714f058c85248990c461048 Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Tue, 23 May 2017 12:55:25 +0200
-Subject: [PATCH] Allow using globalargs as base storage
-
----
- loopy/preprocess.py        |  2 +-
- loopy/target/c/__init__.py | 23 ++++++++++++-----------
- 2 files changed, 13 insertions(+), 12 deletions(-)
-
-diff --git a/loopy/preprocess.py b/loopy/preprocess.py
-index 17226b6..65a9df1 100644
---- a/loopy/preprocess.py
-+++ b/loopy/preprocess.py
-@@ -165,7 +165,7 @@ def find_temporary_scope(kernel):
-     for temp_var in six.itervalues(kernel.temporary_variables):
-         if temp_var.base_storage is not None:
-             # no nesting allowed
--            if temp_var.base_storage in kernel_var_names:
-+            if temp_var.base_storage in kernel.temporary_variables:
-                 raise LoopyError("base_storage for temporary '%s' is '%s', "
-                         "which is an existing variable name"
-                         % (temp_var.name, temp_var.base_storage))
-diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
-index e4835a3..7c6a617 100644
---- a/loopy/target/c/__init__.py
-+++ b/loopy/target/c/__init__.py
-@@ -439,20 +439,21 @@ class CASTBuilder(ASTBuilderBase):
-                 assert tv.initializer is None
- 
-                 offset = 0
--                base_storage_sizes.setdefault(tv.base_storage, []).append(
--                        tv.nbytes)
--                base_storage_to_scope.setdefault(tv.base_storage, []).append(
--                        tv.scope)
-+                if not tv.base_storage in kernel.args:
-+                    base_storage_sizes.setdefault(tv.base_storage, []).append(
-+                            tv.nbytes)
-+                    base_storage_to_scope.setdefault(tv.base_storage, []).append(
-+                            tv.scope)
- 
--                align_size = tv.dtype.itemsize
-+                    align_size = tv.dtype.itemsize
- 
--                from loopy.kernel.array import VectorArrayDimTag
--                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
--                    if isinstance(dim_tag, VectorArrayDimTag):
--                        align_size *= axis_len
-+                    from loopy.kernel.array import VectorArrayDimTag
-+                    for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
-+                        if isinstance(dim_tag, VectorArrayDimTag):
-+                            align_size *= axis_len
- 
--                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
--                        align_size)
-+                    base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
-+                            align_size)
- 
-                 for idi in decl_info:
-                     cast_decl = POD(self, idi.dtype, "")
--- 
-2.1.4
-
diff --git a/python/dune/perftool/blockstructured/basis.py b/python/dune/perftool/blockstructured/basis.py
index 59d55174faa0f9fbc0133bcf6cbd523913477a4f..77b266d75d9b0312b5bd9e8e8763223b05dd5852 100644
--- a/python/dune/perftool/blockstructured/basis.py
+++ b/python/dune/perftool/blockstructured/basis.py
@@ -21,6 +21,8 @@ from dune.perftool.pdelab.spaces import type_leaf_gfs
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.blockstructured.spaces import lfs_inames
 
+from ufl import MixedElement
+
 import pymbolic.primitives as prim
 
 
@@ -79,7 +81,7 @@ def evaluate_basis(leaf_element, name, restriction):
 
 
 def pymbolic_basis(leaf_element, restriction, number, context=''):
-    assert leaf_element.num_sub_elements() == 0
+    assert not isinstance(leaf_element, MixedElement)
     name = "phi_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_basis(leaf_element, name, restriction)
@@ -102,7 +104,7 @@ def evaluate_reference_gradient(leaf_element, name, restriction):
 
 
 def pymbolic_reference_gradient(leaf_element, restriction, number, context=''):
-    assert leaf_element.num_sub_elements() == 0
+    assert not isinstance(leaf_element, MixedElement)
     name = "js_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_reference_gradient(leaf_element, name, restriction)
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index 2bda1822594203be1e406e6452154fb13fd8f8f5..0c7ab18a63f12b9e870992aad88143d65229ef67 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -35,7 +35,11 @@ from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.pdelab.driver import (isPk,
                                          isQk,
                                          isDG)
+
 from pymbolic.primitives import Product, Subscript, Variable
+
+from ufl import MixedElement
+
 from loopy import Reduction
 
 
@@ -107,7 +111,7 @@ def evaluate_basis(leaf_element, name, restriction):
 
 
 def pymbolic_basis(leaf_element, restriction, number, context=''):
-    assert leaf_element.num_sub_elements() == 0
+    assert not isinstance(leaf_element, MixedElement)
     name = "phi_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_basis(leaf_element, name, restriction)
@@ -135,7 +139,7 @@ def evaluate_reference_gradient(leaf_element, name, restriction):
 
 
 def pymbolic_reference_gradient(leaf_element, restriction, number, context=''):
-    assert leaf_element.num_sub_elements() == 0
+    assert not isinstance(leaf_element, MixedElement)
     name = "js_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_reference_gradient(leaf_element, name, restriction)
@@ -156,7 +160,7 @@ def shape_as_pymbolic(shape):
 @kernel_cached
 def evaluate_coefficient(visitor, element, name, container, restriction, index):
     sub_element = element
-    if element.num_sub_elements() > 0:
+    if isinstance(element, MixedElement):
         sub_element = element.extract_component(index)[1]
 
     from ufl import FiniteElement
@@ -189,7 +193,7 @@ def evaluate_coefficient(visitor, element, name, container, restriction, index):
 @kernel_cached
 def evaluate_coefficient_gradient(visitor, element, name, container, restriction, index):
     sub_element = element
-    if element.num_sub_elements() > 0:
+    if isinstance(element, MixedElement):
         sub_element = element.extract_component(index)[1]
     from ufl import FiniteElement
     assert isinstance(sub_element, FiniteElement)
diff --git a/python/dune/perftool/pdelab/driver/gridfunctionspace.py b/python/dune/perftool/pdelab/driver/gridfunctionspace.py
index bf47920a94513d0075b92d3b65bc8357081e371e..bf20ff6695941280b39d5680a6ca087ca4e54da2 100644
--- a/python/dune/perftool/pdelab/driver/gridfunctionspace.py
+++ b/python/dune/perftool/pdelab/driver/gridfunctionspace.py
@@ -223,31 +223,31 @@ def name_test_gfs():
     return name_gfs(element, is_dirichlet)
 
 
-def name_gfs(element, is_dirichlet, treepath=()):
+def name_gfs(element, is_dirichlet, treepath=(), root=True):
     if isinstance(element, (VectorElement, TensorElement)):
         subel = element.sub_elements()[0]
-        subgfs = name_gfs(subel, is_dirichlet[:subel.value_size()], treepath=treepath + (0,))
+        subgfs = name_gfs(subel, is_dirichlet[:subel.value_size()], treepath=treepath + (0,), root=False)
         name = "{}_pow{}gfs_{}".format(subgfs,
                                        element.num_sub_elements(),
                                        "_".join(str(t) for t in treepath))
-        define_power_gfs(element, is_dirichlet, name, subgfs)
+        define_power_gfs(element, is_dirichlet, name, subgfs, root)
         return name
     elif isinstance(element, MixedElement):
         k = 0
         subgfs = []
         for i, subel in enumerate(element.sub_elements()):
-            subgfs.append(name_gfs(subel, is_dirichlet[k:k + subel.value_size()], treepath=treepath + (i,)))
+            subgfs.append(name_gfs(subel, is_dirichlet[k:k + subel.value_size()], treepath=treepath + (i,), root=False))
             k = k + subel.value_size()
         name = "_".join(subgfs)
         name = "{}_{}".format(name, "_".join(str(t) for t in treepath))
-        define_composite_gfs(element, is_dirichlet, name, tuple(subgfs))
+        define_composite_gfs(element, is_dirichlet, name, tuple(subgfs), root)
         return name
     else:
         assert isinstance(element, (FiniteElement, TensorProductElement))
         name = "{}{}_gfs_{}".format(FEM_name_mangling(element).lower(),
                                     "_dirichlet" if is_dirichlet[0] else "",
                                     "_".join(str(t) for t in treepath))
-        define_gfs(element, is_dirichlet, name)
+        define_gfs(element, is_dirichlet, name, root)
         return name
 
 
@@ -263,34 +263,34 @@ def type_trial_gfs():
     return type_gfs(element, is_dirichlet)
 
 
-def type_gfs(element, is_dirichlet):
+def type_gfs(element, is_dirichlet, root=True):
     if isinstance(element, (VectorElement, TensorElement)):
         subel = element.sub_elements()[0]
-        subgfs = type_gfs(subel, is_dirichlet[:subel.value_size()])
+        subgfs = type_gfs(subel, is_dirichlet[:subel.value_size()], root=False)
         name = "{}_POW{}GFS".format(subgfs, element.num_sub_elements())
-        typedef_power_gfs(element, is_dirichlet, name, subgfs)
+        typedef_power_gfs(element, is_dirichlet, name, subgfs, root)
         return name
     elif isinstance(element, MixedElement):
         k = 0
         subgfs = []
         for subel in element.sub_elements():
-            subgfs.append(type_gfs(subel, is_dirichlet[k:k + subel.value_size()]))
+            subgfs.append(type_gfs(subel, is_dirichlet[k:k + subel.value_size()], root=False))
             k = k + subel.value_size()
         name = "_".join(subgfs)
-        typedef_composite_gfs(element, name, tuple(subgfs))
+        typedef_composite_gfs(element, name, tuple(subgfs), root)
         return name
     else:
         assert isinstance(element, (FiniteElement, TensorProductElement))
         name = "{}{}_GFS".format(FEM_name_mangling(element).upper(),
                                  "_dirichlet" if is_dirichlet[0] else "",
                                  )
-        typedef_gfs(element, is_dirichlet, name)
+        typedef_gfs(element, is_dirichlet, name, root)
         return name
 
 
 @preamble
-def define_gfs(element, is_dirichlet, name):
-    gfstype = type_gfs(element, is_dirichlet)
+def define_gfs(element, is_dirichlet, name, root):
+    gfstype = type_gfs(element, is_dirichlet, root=root)
     gv = name_leafview()
     fem = name_fem(element)
     return ["{} {}({}, {});".format(gfstype, name, gv, fem),
@@ -298,24 +298,23 @@ def define_gfs(element, is_dirichlet, name):
 
 
 @preamble
-def define_power_gfs(element, is_dirichlet, name, subgfs):
-    gv = name_leafview()
-    fem = name_fem(element.sub_elements()[0])
-    gfstype = type_gfs(element, is_dirichlet)
-    return ["{} {}({}, {});".format(gfstype, name, gv, fem),
-            "{}.name(\"{}\");".format(name, name)]
+def define_power_gfs(element, is_dirichlet, name, subgfs, root):
+    gfstype = type_gfs(element, is_dirichlet, root=root)
+    names = ["using namespace Dune::Indices;"]
+    names = names + ["{0}.child(_{1}).name(\"{0}_{1}\");".format(name, i) for i in range(element.num_sub_elements())]
+    return ["{} {}({});".format(gfstype, name, subgfs)] + names
 
 
 @preamble
-def define_composite_gfs(element, is_dirichlet, name, subgfs):
-    gfstype = type_gfs(element, is_dirichlet)
+def define_composite_gfs(element, is_dirichlet, name, subgfs, root):
+    gfstype = type_gfs(element, is_dirichlet, root=root)
     return ["{} {}({});".format(gfstype, name, ", ".join(subgfs)),
             "{}.update();".format(name)]
 
 
 @preamble
-def typedef_gfs(element, is_dirichlet, name):
-    vb = type_vectorbackend(element)
+def typedef_gfs(element, is_dirichlet, name, root):
+    vb = type_vectorbackend(element, root)
     gv = type_leafview()
     fem = type_fem(element)
     cass = type_constraintsassembler(bool(is_dirichlet[0]))
@@ -323,58 +322,50 @@ def typedef_gfs(element, is_dirichlet, name):
 
 
 @preamble
-def typedef_power_gfs(element, is_dirichlet, name, subgfs):
-    include_file("dune/pdelab/gridfunctionspace/vectorgridfunctionspace.hh", filetag="driver")
-    gv = type_leafview()
-    vb = type_vectorbackend(element)
-    fem = type_fem(element.sub_elements()[0])
-    dim = element.num_sub_elements()
-    cass = type_constraintsassembler(bool(is_dirichlet[0]))
-    return "using {} = Dune::PDELab::VectorGridFunctionSpace<{}, {}, {}, {}, {}, {}>;".format(name, gv, fem, dim, vb, vb, cass)
+def typedef_power_gfs(element, is_dirichlet, name, subgfs, root):
+    include_file("dune/pdelab/gridfunctionspace/powergridfunctionspace.hh", filetag="driver")
+    vb = type_vectorbackend(element, root)
+    ot = type_orderingtag(False)
+
+    return "using {} = Dune::PDELab::PowerGridFunctionSpace<{}, {}, {}, {}>;".format(name, subgfs, element.num_sub_elements(), vb, ot)
 
 
 @preamble
-def typedef_composite_gfs(element, name, subgfs):
-    vb = type_vectorbackend(element)
-    ot = type_orderingtag()
+def typedef_composite_gfs(element, name, subgfs, root):
+    vb = type_vectorbackend(element, root)
+    ot = type_orderingtag(isinstance(element, FiniteElement))
     args = ", ".join(subgfs)
     return "using {} = Dune::PDELab::CompositeGridFunctionSpace<{}, {}, {}>;".format(name, vb, ot, args)
 
 
 @preamble
-def define_blocksize(name, element):
-    from dune.perftool.pdelab.driver import isDG, isQuadrilateral
-    assert isDG(element)
-    assert isQuadrilateral(element.cell())
-    dimension = get_dimension()
-    degree = element.degree()
-    return "static const int {} = Dune::QkStuff::QkSize<{}, {}>::value;".format(name, degree, dimension)
-
-
-def name_blocksize(element):
-    name = "blocksize"
-    define_blocksize(name, element)
-    return name
-
-
-@preamble
-def typedef_vectorbackend(name, element):
+def typedef_vectorbackend(name, element, root):
     include_file("dune/pdelab/backend/istl.hh", filetag="driver")
-    if get_option("fastdg"):
-        blocksize = name_blocksize(element)
-        return "using {} = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::fixed, {}>;".format(name, blocksize)
+    if get_option("fastdg") and root:
+        blocking = "Dune::PDELab::ISTL::Blocking::fixed"
     else:
-        return "using {} = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none, 1>;".format(name)
+        blocking = "Dune::PDELab::ISTL::Blocking::none"
 
+    if isinstance(element, MixedElement):
+        blocksize = ""
+    else:
+        include_file("dune/pdelab/finiteelement/qkdglagrange.hh", filetag="driver")
+        blocksize = ", Dune::QkStuff::QkSize<{}, {}>::value".format(element.degree(), get_dimension())
 
-def type_vectorbackend(element):
-    name = "VectorBackend"
-    typedef_vectorbackend(name, element)
+    return "using {} = Dune::PDELab::ISTL::VectorBackend<{}{}>;".format(name, blocking, blocksize)
+
+
+def type_vectorbackend(element, root):
+    name = "VectorBackend{}".format(FEM_name_mangling(element).upper())
+    typedef_vectorbackend(name, element, root)
     return name
 
 
-def type_orderingtag():
-    return "Dune::PDELab::LexicographicOrderingTag"
+def type_orderingtag(leaf):
+    if leaf or not get_option("fastdg"):
+        return "Dune::PDELab::LexicographicOrderingTag"
+    else:
+        return "Dune::PDELab::EntityBlockedOrderingTag"
 
 
 @preamble
diff --git a/python/dune/perftool/pdelab/driver/gridoperator.py b/python/dune/perftool/pdelab/driver/gridoperator.py
index eb6cfd893ddd68c4f987e7dea77c3a501c9c71b6..39071cc8db29d90ff5189808ef4991a2c2794707 100644
--- a/python/dune/perftool/pdelab/driver/gridoperator.py
+++ b/python/dune/perftool/pdelab/driver/gridoperator.py
@@ -134,7 +134,7 @@ def name_dofestimate():
 @preamble
 def typedef_matrixbackend(name):
     include_file("dune/pdelab/backend/istl.hh", filetag="driver")
-    return "using {} = Dune::PDELab::istl::BCRSMatrixBackend<>;".format(name)
+    return "using {} = Dune::PDELab::ISTL::BCRSMatrixBackend<>;".format(name)
 
 
 def type_matrixbackend():
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index 19f19f8c8483f7a9d2864a3ca08e4d88db10f4be..dac38218879442f9cc9c91d9a3c41cd2f3cbec5c 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -266,9 +266,12 @@ def generate_accumulation_instruction(expr, visitor):
                        stage=3,
                        position_priority=priority,
                        accumvar=accumvar,
-                       trial_element=trial_leaf_element,
+                       test_element=test_info.element,
+                       test_element_index=test_info.element_index,
+                       trial_element=trial_info.element,
+                       trial_element_index=trial_info.element_index,
                        input=AlreadyAssembledInput(index=(test_info.element_index,)),
-                       component_index=test_info.element_index,
+                       predicates=predicates,
                        )
 
     from dune.perftool.sumfact.vectorization import attach_vectorization_info
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 00ef29661f33d4c7f32600babe622789caca50d3..d01c8ef8edead963d0bd56fd4e34c956484957d3 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -160,10 +160,10 @@ def pymbolic_coefficient_gradient(element, restriction, index, coeff_func, visit
 @kernel_cached
 def pymbolic_coefficient(element, restriction, index, coeff_func, visitor_indices):
     sub_element = element
-    if element.num_sub_elements() > 0:
+    if isinstance(element, MixedElement):
         sub_element = element.extract_component(index)[1]
     from ufl import FiniteElement
-    assert isinstance(sub_element, FiniteElement)
+    assert isinstance(sub_element, (FiniteElement, TensorProductElement))
 
     # Basis functions per direction
     basis_size = _basis_functions_per_direction(sub_element)
@@ -262,7 +262,7 @@ def pymbolic_basis(element, restriction, number):
     if number == 0:
         return 1
 
-    assert element.num_sub_elements() == 0
+    assert not isinstance(element, MixedElement)
 
     name = "phi_{}".format(FEM_name_mangling(element))
     name = restricted_name(name, restriction)
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index 5c9e4a5b50fd07d39c1d4639982d05ca82a40224..5be6fc825ef0ee63e313b65a7aecd9c0f3f113d5 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -9,7 +9,9 @@ from dune.perftool.generation import (barrier,
                                       globalarg,
                                       instruction,
                                       post_include,
+                                      preamble,
                                       silenced_warning,
+                                      temporary_variable,
                                       transform,
                                       )
 from dune.perftool.loopy.buffer import (get_buffer_temporary,
@@ -27,6 +29,8 @@ from dune.perftool.sumfact.permutation import (sumfact_permutation_strategy,
 from dune.perftool.sumfact.accumulation import sumfact_iname
 from dune.perftool.loopy.vcl import ExplicitVCLCast
 
+from ufl import MixedElement
+
 import loopy as lp
 import numpy as np
 import pymbolic.primitives as prim
@@ -39,6 +43,11 @@ def realize_sum_factorization_kernel(sf, **kwargs):
         return _realize_sum_factorization_kernel(sf, **kwargs)
 
 
+@preamble
+def alias_data_array(name, data):
+    return "auto {} = {}.data();".format(name, data)
+
+
 @generator_factory(item_tags=("sumfactkernel",),
                    context_tags=("kernel",),
                    cache_key_generator=lambda s, **kw: s.cache_key)
@@ -67,10 +76,11 @@ def _realize_sum_factorization_kernel(sf):
             sf.input.realize(sf, 0, insn_dep)
 
         insn_dep = insn_dep.union(frozenset({lp.match.Writes("input_{}".format(sf.buffer))}))
-
-    direct_output = None
-    if get_option('fastdg') and sf.stage == 3:
-        direct_output = sf.accumvar + ".data()"
+    else:
+        if sf.input.element_index is None:
+            direct_input_arg = "{}_access".format(direct_input)
+        else:
+            direct_input_arg = "{}_access_comp{}".format(direct_input, sf.input.element_index)
 
     # Prepare some dim_tags/shapes for later use
     ftags = ",".join(["f"] * sf.length)
@@ -128,13 +138,19 @@ def _realize_sum_factorization_kernel(sf):
             input_inames = permute_backward(input_inames, perm)
             inp_shape = permute_backward(inp_shape, perm)
 
-            globalarg(direct_input, dtype=np.float64, shape=inp_shape, dim_tags=novec_ftags)
+            globalarg(direct_input_arg,
+                      dtype=np.float64,
+                      shape=inp_shape,
+                      dim_tags=novec_ftags,
+                      offset=_dof_offset(sf.input.element, sf.input.element_index),
+                      )
+            alias_data_array(direct_input_arg, direct_input)
             if matrix.vectorized:
                 input_summand = prim.Call(ExplicitVCLCast(np.float64, vector_width=sf.vector_width),
-                                          (prim.Subscript(prim.Variable(direct_input),
+                                          (prim.Subscript(prim.Variable(direct_input_arg),
                                                           input_inames),))
             else:
-                input_summand = prim.Subscript(prim.Variable(direct_input),
+                input_summand = prim.Subscript(prim.Variable(direct_input_arg),
                                                input_inames + vec_iname)
         else:
             # If we did permute the order of a matrices above we also
@@ -192,17 +208,41 @@ def _realize_sum_factorization_kernel(sf):
 
         # In case of direct output we directly accumulate the result
         # of the Sumfactorization into some global data structure.
-        if l == len(matrix_sequence) - 1 and direct_output is not None:
+        if l == len(matrix_sequence) - 1 and get_option('fastdg') and sf.stage == 3:
             ft = get_global_context_value("form_type")
+            if sf.test_element_index is None:
+                direct_output = "{}_access".format(sf.accumvar)
+            else:
+                direct_output = "{}_access_comp{}".format(sf.accumvar, sf.test_element_index)
             if ft == 'residual' or ft == 'jacobian_apply':
-                globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags)
+                globalarg(direct_output,
+                          dtype=np.float64,
+                          shape=output_shape,
+                          dim_tags=novec_ftags,
+                          offset=_dof_offset(sf.test_element, sf.test_element_index),
+                          )
+                alias_data_array(direct_output, sf.accumvar)
+
                 assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
             else:
                 assert ft == 'jacobian'
+
+                direct_output = "{}x{}".format(direct_output, sf.trial_element_index)
+                rowsize = sum(tuple(s for s in _local_sizes(sf.trial_element)))
+                element = sf.trial_element
+                if isinstance(element, MixedElement):
+                    element = element.extract_component(sf.trial_element_index)[1]
+                other_shape = tuple(element.degree() + 1 for e in range(sf.length))
+                from pytools import product
+                manual_strides = tuple("stride:{}".format(rowsize * product(output_shape[:i])) for i in range(sf.length))
+                dim_tags = "{},{}".format(novec_ftags, ",".join(manual_strides))
                 globalarg(direct_output,
                           dtype=np.float64,
-                          shape=output_shape + output_shape,
-                          dim_tags=novec_ftags + "," + novec_ftags)
+                          shape=other_shape + output_shape,
+                          offset=rowsize * _dof_offset(sf.test_element, sf.test_element_index) + _dof_offset(sf.trial_element, sf.trial_element_index),
+                          dim_tags=dim_tags,
+                          )
+                alias_data_array(direct_output, sf.accumvar)
                 # TODO: It is at least questionnable, whether using the *order* of the inames in here
                 #       for indexing is a good idea. Then again, it is hard to find an alternative.
                 _ansatz_inames = tuple(prim.Variable(i) for i in sf.within_inames)
@@ -226,6 +266,7 @@ def _realize_sum_factorization_kernel(sf):
                                           forced_iname_deps_is_final=True,
                                           depends_on=insn_dep,
                                           tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
+                                          predicates=sf.predicates,
                                           )
                               })
 
@@ -248,3 +289,22 @@ def _realize_sum_factorization_kernel(sf):
     silenced_warning('read_no_write({})'.format(out))
 
     return lp.TaggedVariable(out, sf.tag), insn_dep
+
+
+def _local_sizes(element):
+    from ufl import FiniteElement, MixedElement
+    if isinstance(element, MixedElement):
+        for subel in element.sub_elements():
+            for s in _local_sizes(subel):
+                yield s
+    else:
+        assert isinstance(element, FiniteElement)
+        yield (element.degree() + 1)**element.cell().geometric_dimension()
+
+
+def _dof_offset(element, component):
+    if component is None:
+        return 0
+    else:
+        sizes = tuple(s for s in _local_sizes(element))
+        return sum(sizes[0:component])
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index a0cc1054afac0a67f312667f7fa4243700c1ccac..7aa9c327001c807740956a3549966d7de0d5f6a1 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -8,6 +8,8 @@ from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTab
 
 from pytools import ImmutableRecord, product
 
+from ufl import MixedElement
+
 import pymbolic.primitives as prim
 import loopy as lp
 import frozendict
@@ -37,8 +39,11 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
                  insn_dep=frozenset(),
                  input=None,
                  accumvar=None,
-                 component_index=None,
+                 test_element=None,
+                 test_element_index=None,
                  trial_element=None,
+                 trial_element_index=None,
+                 predicates=frozenset(),
                  ):
         """Create a sum factorization kernel
 
@@ -100,10 +105,12 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             other.
         input: An SumfactKernelInputBase instance describing the input of the kernel
         accumvar: The accumulation variable to accumulate into
-        component_index: Index to get the right input coefficient
-            (needed for systems of PDEs)
         trial_element: The leaf element of the trial function space.
-            Used to correctly nest stage 3 in the sumfact case.
+            Used to correctly nest stage 3 in the jacobian case.
+        test_element: The leaf element of the test function space
+            Used to compute offsets in the fastdg case.
+        test_element_index: the component of the test_element
+        trial_element_index: the component of the trial_element
         """
         # Assert the inputs!
         assert isinstance(matrix_sequence, tuple)
@@ -152,7 +159,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         Any two sum factorization kernels having the same cache_key
         are realized simulatenously!
         """
-        return (self.matrix_sequence, self.restriction, self.stage, self.buffer, self.component_index)
+        return (self.matrix_sequence, self.restriction, self.stage, self.buffer, self.test_element_index)
 
     @property
     def input_key(self):
@@ -186,7 +193,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
             return ()
         else:
             from dune.perftool.sumfact.basis import lfs_inames
-            return lfs_inames(self.trial_element, self.restriction)
+            element = self.trial_element
+            if isinstance(element, MixedElement):
+                element = element.extract_component(self.trial_element_index)[1]
+            return lfs_inames(element, self.restriction)
 
     def vec_index(self, sf):
         """ Map an unvectorized sumfact kernel object to its position
@@ -203,7 +213,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         return tuple(mat.quadrature_size for mat in self.matrix_sequence)
 
     def quadrature_index(self, _):
-        quad_inames = quadrature_inames(self.trial_element)
+        element = self.trial_element
+        if element is not None and isinstance(element, MixedElement):
+            element = element.extract_component(self.trial_element_index)[1]
+        quad_inames = quadrature_inames(element)
         if len(self.matrix_sequence) == local_dimension():
             return tuple(prim.Variable(i) for i in quad_inames)
 
@@ -309,6 +322,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         assert len(set(k.length for k in kernels)) == 1
         assert len(set(k.restriction for k in kernels)) == 1
         assert len(set(k.within_inames for k in kernels)) == 1
+        assert len(set(k.predicates for k in kernels)) == 1
 
         # Assert properties of the matrix sequence of the underlying kernels
         for i in range(kernels[0].length):
@@ -379,10 +393,26 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     def within_inames(self):
         return self.kernels[0].within_inames
 
+    @property
+    def test_element(self):
+        return self.kernels[0].test_element
+
+    @property
+    def test_element_index(self):
+        return self.kernels[0].test_element_index
+
     @property
     def trial_element(self):
         return self.kernels[0].trial_element
 
+    @property
+    def trial_element_index(self):
+        return self.kernels[0].trial_element_index
+
+    @property
+    def predicates(self):
+        return self.kernels[0].predicates
+
     @property
     def input(self):
         assert len(set(k.input for k in self.kernels)) == 1
@@ -437,7 +467,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         return 0
 
     def _quadrature_index(self, sf):
-        quad_inames = quadrature_inames(self.trial_element)
+        element = self.trial_element
+        if element is not None and isinstance(element, MixedElement):
+            element = element.extract_component(self.trial_element_index)[1]
+        quad_inames = quadrature_inames(element)
         index = []
 
         if len(self.matrix_sequence) == local_dimension():
@@ -466,7 +499,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         return tuple(index)
 
     def vec_index(self, sf):
-        quad_inames = quadrature_inames(self.trial_element)
+        element = self.trial_element
+        if element is not None and isinstance(element, MixedElement):
+            element = element.extract_component(self.trial_element_index)[1]
+        quad_inames = quadrature_inames(element)
         sliced = 0
         if len(sf.matrix_sequence) == local_dimension():
             for d in range(local_dimension()):
diff --git a/python/loopy b/python/loopy
index 3b74c2863b4ed82da208375f9f29bba08a3a7c7c..095644a7d7a52e2ae43c747e380ddf6003c9ee18 160000
--- a/python/loopy
+++ b/python/loopy
@@ -1 +1 @@
-Subproject commit 3b74c2863b4ed82da208375f9f29bba08a3a7c7c
+Subproject commit 095644a7d7a52e2ae43c747e380ddf6003c9ee18
diff --git a/test/heatequation/driver.hh b/test/heatequation/driver.hh
index 1aaf9f171747325885cfebf6a74103e30575d138..4222ca89cea72d35021faabee8344c24a5cc63f0 100644
--- a/test/heatequation/driver.hh
+++ b/test/heatequation/driver.hh
@@ -30,7 +30,7 @@ void driver (const GV& gv, const FEM& fem, Dune::ParameterTree& ptree)
 
   // Make grid function space
   typedef Dune::PDELab::ConformingDirichletConstraints CON;
-  typedef Dune::PDELab::istl::VectorBackend<> VBE;
+  typedef Dune::PDELab::ISTL::VectorBackend<> VBE;
   typedef Dune::PDELab::GridFunctionSpace<GV,FEM,CON,VBE> GFS;
   GFS gfs(gv,fem);
   gfs.name("Vh");
@@ -63,7 +63,7 @@ void driver (const GV& gv, const FEM& fem, Dune::ParameterTree& ptree)
   // LOP lop(ptree, params_poisson);
 
 
-  typedef Dune::PDELab::istl::BCRSMatrixBackend<> MBE;
+  typedef Dune::PDELab::ISTL::BCRSMatrixBackend<> MBE;
   int degree = ptree.get("fem.degree",(int)1);
   MBE mbe((int)pow(1+2*degree,dim));
   typedef Dune::PDELab::GridOperator<GFS,GFS,LOP,MBE,
diff --git a/test/laplace/reference_driver.hh b/test/laplace/reference_driver.hh
index db1b55946833b34ae8e925737c7c127241b57f2a..02791b7dcb785463376e4dca0c15407f496d0c74 100644
--- a/test/laplace/reference_driver.hh
+++ b/test/laplace/reference_driver.hh
@@ -88,7 +88,7 @@ class CDProb
 };
 
 
-void driver(int argc, char** argv){  typedef Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none, 1> VectorBackend;
+void driver(int argc, char** argv){  typedef Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::none, 1> VectorBackend;
   static const int dim = 2;
   typedef Dune::UGGrid<dim> Grid;
   typedef Grid::LeafGridView GV;
@@ -110,7 +110,7 @@ void driver(int argc, char** argv){  typedef Dune::PDELab::istl::VectorBackend<D
   VTKWriter vtkwriter(gv, sublevel);
   using LOP = Dune::PDELab::ConvectionDiffusionDG<CDProb<GV, R>, DG1_FEM>;
   typedef DG1_DIRICHLET_GFS::ConstraintsContainer<R>::Type DG1_CC;
-  typedef Dune::PDELab::istl::BCRSMatrixBackend<> MatrixBackend;
+  typedef Dune::PDELab::ISTL::BCRSMatrixBackend<> MatrixBackend;
   typedef Dune::PDELab::GridOperator<DG1_DIRICHLET_GFS, DG1_DIRICHLET_GFS, LOP, MatrixBackend, DF, R, R, DG1_CC, DG1_CC> GO;
   typedef GO::Traits::Domain V;
   V x(dg1_dirichlet_gfs);
diff --git a/test/nonlinear/reference_driver.hh b/test/nonlinear/reference_driver.hh
index db0b072f1e0cd3050d2b6a0ac36adffc91f4cf1c..e24b24e1c534fac76d6c82bf12ec45426d9c327c 100644
--- a/test/nonlinear/reference_driver.hh
+++ b/test/nonlinear/reference_driver.hh
@@ -56,7 +56,7 @@ void driver (int argc, char** argv){
   typedef Dune::PDELab::PkLocalFiniteElementMap<GV,DF,double,1> FEM;
   FEM fem(gv);
   typedef Dune::PDELab::ConformingDirichletConstraints CON;
-  typedef Dune::PDELab::istl::VectorBackend<> VBE;
+  typedef Dune::PDELab::ISTL::VectorBackend<> VBE;
   typedef Dune::PDELab::GridFunctionSpace<GV,FEM,CON,VBE> GFS;
   GFS gfs(gv,fem);
   gfs.name("Vh");
@@ -85,7 +85,7 @@ void driver (int argc, char** argv){
   LOP lop(problem);
 
   // Make a global operator
-  typedef Dune::PDELab::istl::BCRSMatrixBackend<> MBE;
+  typedef Dune::PDELab::ISTL::BCRSMatrixBackend<> MBE;
   int degree = initree.get("fem.degree",(int)1);
   MBE mbe((int)pow(1+2*degree,dim));
   typedef Dune::PDELab::GridOperator<
diff --git a/test/poisson/reference_driver.hh b/test/poisson/reference_driver.hh
index 6c3e45f2f2c59e8c7a08554aa288e86d6dac3cba..c12054dfbc81ae1ed2350fc6b1dc91755f07b8cd 100644
--- a/test/poisson/reference_driver.hh
+++ b/test/poisson/reference_driver.hh
@@ -94,7 +94,7 @@ class CDProb
 };
 
 
-void driver(int argc, char** argv){  typedef Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none, 1> VectorBackend;
+void driver(int argc, char** argv){  typedef Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::none, 1> VectorBackend;
   static const int dim = 2;
   typedef Dune::UGGrid<dim> Grid;
   typedef Grid::LeafGridView GV;
@@ -116,7 +116,7 @@ void driver(int argc, char** argv){  typedef Dune::PDELab::istl::VectorBackend<D
   VTKWriter vtkwriter(gv, sublevel);
   using LOP = Dune::PDELab::ConvectionDiffusionDG<CDProb<GV, R>, DG1_FEM>;
   typedef DG1_DIRICHLET_GFS::ConstraintsContainer<R>::Type DG1_CC;
-  typedef Dune::PDELab::istl::BCRSMatrixBackend<> MatrixBackend;
+  typedef Dune::PDELab::ISTL::BCRSMatrixBackend<> MatrixBackend;
   typedef Dune::PDELab::GridOperator<DG1_DIRICHLET_GFS, DG1_DIRICHLET_GFS, LOP, MatrixBackend, DF, R, R, DG1_CC, DG1_CC> GO;
   typedef GO::Traits::Domain V;
   V x(dg1_dirichlet_gfs);
diff --git a/test/sumfact/stokes/stokes_3d_dg.mini b/test/sumfact/stokes/stokes_3d_dg.mini
index 28063941746a213957276465c8fa993d4fe2c07f..dac8dbc82bebd1edd8eecfbc17a197e094c69995 100644
--- a/test/sumfact/stokes/stokes_3d_dg.mini
+++ b/test/sumfact/stokes/stokes_3d_dg.mini
@@ -1,4 +1,7 @@
-__name = sumfact_stokes_3d_dg
+__name = sumfact_stokes_3d_dg_{__exec_suffix}
+
+__exec_suffix = {fastdg_suffix}
+fastdg_suffix = fastdg, nonfastdg | expand fastdg
 
 cells = 4 4 4
 extension = 1. 1. 1.
@@ -11,3 +14,4 @@ extension = vtu
 [formcompiler]
 numerical_jacobian = 0
 sumfact = 1
+fastdg = 1, 0 | expand fastdg
diff --git a/test/sumfact/stokes/stokes_dg.mini b/test/sumfact/stokes/stokes_dg.mini
index e533756a11d14de50003dcb5e89c12928e000416..e3374e4a18e844f6f1356ce45b38e5b5212f015f 100644
--- a/test/sumfact/stokes/stokes_dg.mini
+++ b/test/sumfact/stokes/stokes_dg.mini
@@ -1,8 +1,8 @@
 __name = sumfact_stokes_dg_{__exec_suffix}
 
-__exec_suffix = {diff_suffix}_{vec_suffix}
+__exec_suffix = {diff_suffix}_{fastdg_suffix}
 diff_suffix = symdiff, numdiff | expand num
-vec_suffix = quadvec, nonquadvec | expand vec
+fastdg_suffix = fastdg, nonfastdg | expand fastdg
 
 cells = 8 8
 extension = 1. 1.
@@ -14,6 +14,8 @@ extension = vtu
 
 [formcompiler]
 numerical_jacobian = 0, 1 | expand num
-vectorize_quad = 1, 0 | expand vec
 compare_l2errorsquared = 1e-8
 sumfact = 1
+fastdg = 1, 0 | expand fastdg
+
+{formcompiler.fastdg} == 1 and {formcompiler.numerical_jacobian} == 1 | exclude
\ No newline at end of file