diff --git a/.gitmodules b/.gitmodules
index d07bf324168684295261b9284a89551048323a6a..c0aa0e62e13178c6010b9f7120309688556ebd40 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,3 +8,6 @@
 [submodule "python/pymbolic"]
 	path = python/pymbolic
 	url = https://github.com/inducer/pymbolic.git
+[submodule "python/cgen"]
+	path = python/cgen
+	url = https://github.com/inducer/cgen.git
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 9dc164e7e1f6768ee09135706df5e6d378d390a5..e3c67559b3c7fc897e8923b34c25673a924c0d39 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -9,6 +9,7 @@ create_virtualenv_wrapper(COMMANDS python -m IPython notebook
                           NAME notebook)
 
 # Install all the external packages that we have as submodules
+dune_install_python_package(PATH cgen)
 dune_install_python_package(PATH pymbolic)
 dune_install_python_package(PATH loopy)
 dune_install_python_package(PATH ufl)
diff --git a/python/cgen b/python/cgen
new file mode 160000
index 0000000000000000000000000000000000000000..4aa99a0ce7a843da42e6c0f2fba942489cfb3c59
--- /dev/null
+++ b/python/cgen
@@ -0,0 +1 @@
+Subproject commit 4aa99a0ce7a843da42e6c0f2fba942489cfb3c59
diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index 1b084cbc18ac34b4b1193d3c45ed841d0766884a..32463b4b1ccd0af0ececa244c5578b5ccc4a6d67 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -1,5 +1,11 @@
 from __future__ import absolute_import
 
+from dune.perftool.generation.backend import (backend,
+                                              get_backend,
+                                              )
+
+from dune.perftool.generation.counter import get_counter
+
 from dune.perftool.generation.cache import (cached,
                                             generator_factory,
                                             no_caching,
@@ -30,7 +36,9 @@ from dune.perftool.generation.loopy import (constantarg,
                                             globalarg,
                                             iname,
                                             instruction,
+                                            noop_instruction,
                                             pymbolic_expr,
+                                            silenced_warning,
                                             temporary_variable,
                                             valuearg,
                                             )
diff --git a/python/dune/perftool/generation/backend.py b/python/dune/perftool/generation/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..27356c937f1c8617297736a34e74d540dd0996d2
--- /dev/null
+++ b/python/dune/perftool/generation/backend.py
@@ -0,0 +1,41 @@
+from dune.perftool.generation.cache import _RegisteredFunction
+from pytools import Record
+
+
+_backend_mapping = {}
+
+
+class FuncProxy(Record):
+    def __init__(self, interface, name, func):
+        Record.__init__(self, interface=interface, name=name, func=func)
+
+
+def register_backend(interface, name, func):
+    _backend_mapping.setdefault(interface, {})
+    assert name not in _backend_mapping[interface]
+    _backend_mapping[interface][name] = func
+
+
+def backend(interface=None, name='default'):
+    assert interface
+
+    def _dec(func):
+        if not isinstance(func, _RegisteredFunction):
+            # Allow order independence of the generator decorators
+            # and the backend decorator by delaying the registration
+            func = FuncProxy(interface, name, func)
+        else:
+            register_backend(interface, name, func)
+
+        return func
+
+    return _dec
+
+
+def get_backend(interface=None, selector=None, **kwargs):
+    assert interface and selector
+
+    select = selector(**kwargs)
+    assert select in _backend_mapping[interface]
+
+    return _backend_mapping[interface][select]
diff --git a/python/dune/perftool/generation/cache.py b/python/dune/perftool/generation/cache.py
index 5c2b6cc827d41a790cbb364d8554adff8c5e6af0..53c8eada2d1722bc554bd802527264facbad8ed9 100644
--- a/python/dune/perftool/generation/cache.py
+++ b/python/dune/perftool/generation/cache.py
@@ -1,6 +1,7 @@
 """ This module provides the memoization infrastructure for code
 generating functions.
 """
+from dune.perftool.generation.counter import get_counter
 
 # Store a global list of generator functions
 _generators = []
@@ -40,16 +41,8 @@ def _freeze(data):
     raise TypeError('Cannot freeze non-hashable object {} of type {}'.format(data, type(data)))
 
 
-class _GlobalCounter(object):
-    counter = 0
-
-    def get(self):
-        _GlobalCounter.counter = _GlobalCounter.counter + 1
-        return _GlobalCounter.counter
-
-
 def no_caching(*a, **k):
-    return _GlobalCounter().get()
+    return get_counter('__no_caching')
 
 
 class _RegisteredFunction(object):
@@ -72,6 +65,13 @@ class _RegisteredFunction(object):
         # Register this generator function
         _generators.append(self)
 
+        # Allow order independence of the backend and the generator decorators.
+        # If backend was applied first, we resolve the issued FuncProxy object
+        from dune.perftool.generation.backend import FuncProxy, register_backend
+        if isinstance(self.func, FuncProxy):
+            register_backend(self.func.interface, self.func.name, self)
+            self.func = self.func.func
+
     def _get_content(self, key):
         if self.counted:
             return self._memoize_cache[key][1]
@@ -90,7 +90,7 @@ class _RegisteredFunction(object):
             val = self.on_store(self.func(*args, **kwargs))
             # Maybe wrap it with a counter!
             if self.counted:
-                val = (_GlobalCounter().get(), val)
+                val = (get_counter('__cache_counted'), val)
             # and store the result
             self._memoize_cache[cache_key] = val
 
diff --git a/python/dune/perftool/generation/counter.py b/python/dune/perftool/generation/counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..607d7eaa79351d4b9f50e79a919d0c8e075aac83
--- /dev/null
+++ b/python/dune/perftool/generation/counter.py
@@ -0,0 +1,9 @@
+""" Some global counters used to generate unique names throughout the project """
+
+_counts = {}
+
+
+def get_counter(identifier):
+    count = _counts.setdefault(identifier, 0)
+    _counts[identifier] = _counts[identifier] + 1
+    return count
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index 10620a1ef073c181774a21aff8073eb1fc8fad9a..f62f309c069b3cbec2ca3a9ebce6f13cd7653646 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -1,7 +1,8 @@
 """ The loopy specific generators """
 from __future__ import absolute_import
 
-from dune.perftool.generation import (generator_factory,
+from dune.perftool.generation import (get_counter,
+                                      generator_factory,
                                       no_caching,
                                       preamble,
                                       )
@@ -12,6 +13,7 @@ import numpy
 iname = generator_factory(item_tags=("iname",))
 pymbolic_expr = generator_factory(item_tags=("kernel", "pymbolic"))
 function_mangler = generator_factory(item_tags=("mangler",))
+silenced_warning = generator_factory(item_tags=("silenced_warning",), no_deco=True)
 
 
 @generator_factory(item_tags=("argument", "globalarg"),
@@ -80,29 +82,56 @@ def default_declaration(name, shape=(), shape_impl=()):
         return '{} {}(0.0);'.format(t, name)
 
 
-class _TemporaryCounter:
-    counter = 0
+def declaration_with_base_storage(base_storage, storage_shape):
+    # We currently consider the base storage to be linear
+    assert(len(storage_shape) == 1)
+
+    @preamble
+    def base_storage_decl(name, shape=(), shape_impl=()):
+        default_declaration(base_storage, shape=storage_shape, shape_impl=('arr',))
+        return 'double* {} = {};'.format(name, base_storage)
+
+    return base_storage_decl
 
 
 def get_temporary_name():
-    name = 'expr_{}'.format(str(_TemporaryCounter.counter).zfill(4))
-    _TemporaryCounter.counter = _TemporaryCounter.counter + 1
-    return name
+    return 'expr_{}'.format(str(get_counter('__temporary').zfill(4)))
+
+
+class UnmanagedTemporaryVariable(loopy.TemporaryVariable):
+    """
+    A TemporaryVariable class that does not have knowledge of data layout.
+    Is used when the data layout is under control of the underlying C++ type.
+    Indexed access will result in multiple operator[]s. Only known to
+    the DuneTarget.
+    """
+    pass
 
 
 @generator_factory(item_tags=("temporary",), cache_key_generator=no_caching)
-def temporary_variable(name, **kwargs):
+def temporary_variable(name, managed=False, **kwargs):
     if 'dtype' not in kwargs:
         kwargs['dtype'] = numpy.float64
 
+    # Check if a specified base_storage has already been initialized
+    if kwargs.get('base_storage', None):
+        assert(kwargs.get('decl_method', None) is None)
+        assert('storage_shape' in kwargs)
+        decl_method = declaration_with_base_storage(kwargs['base_storage'], kwargs['storage_shape'])
+    else:
+        decl_method = kwargs.pop('decl_method', default_declaration)
+
     shape = kwargs.get('shape', ())
     shape_impl = kwargs.pop('shape_impl', ('arr',) * len(shape))
-    decl_method = kwargs.pop('decl_method', default_declaration)
 
     if decl_method is not None:
         decl_method(name, shape, shape_impl)
 
-    return loopy.TemporaryVariable(name, **kwargs)
+    if managed:
+        return loopy.TemporaryVariable(name, **kwargs)
+    else:
+        return UnmanagedTemporaryVariable(name, **kwargs)
+
 
 # Now define generators for instructions. To ease dependency handling of instructions
 # these generators are a bit more involved... We apply the following procedure:
@@ -139,10 +168,6 @@ def call_instruction_impl(**kw):
     return loopy.CallInstruction(**kw)
 
 
-class _IDCounter:
-    count = 0
-
-
 def _insn_cache_key(code=None, expression=None, **kwargs):
     if code is not None:
         return code
@@ -158,8 +183,7 @@ def instruction(code=None, expression=None, **kwargs):
     assert 'id' not in kwargs
 
     # Get an ID for this instruction
-    id = 'insn' + str(_IDCounter.count).zfill(4)
-    _IDCounter.count = _IDCounter.count + 1
+    id = 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4))
 
     # Now create the actual instruction
     if code:
@@ -172,3 +196,8 @@ def instruction(code=None, expression=None, **kwargs):
 
     # return the ID, as it is the only useful information to the user
     return id
+
+
+@generator_factory(item_tags=("instruction",), cache_key_generator=lambda **kw: kw['id'])
+def noop_instruction(**kwargs):
+    return loopy.NoOpInstruction(**kwargs)
diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4238c4929ca188edf6b53dd4583937b47d4df91b
--- /dev/null
+++ b/python/dune/perftool/loopy/buffer.py
@@ -0,0 +1,61 @@
+from dune.perftool.generation import (generator_factory,
+                                      temporary_variable,
+                                      )
+
+
+class FlipFlopBuffer(object):
+    def __init__(self, identifier, base_storage_size=None, num=2):
+        self.identifier = identifier
+        self.base_storage_size = base_storage_size
+        self.num = num
+
+        # Initialize the counter that switches between the base storages!
+        self._current = 0
+
+        # Initialize a total counter for the issued temporaries
+        self._counter = 0
+
+        # Generate the base storage names
+        self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in range(self.num))
+
+    def switch_base_storage(self):
+        self._current = (self._current + 1) % self.num
+
+    def get_temporary(self, **kwargs):
+        assert("base_storage" not in kwargs)
+        assert("storage_shape" not in kwargs)
+
+        # Select the base storage and increase counter
+        base = self.base_storage[self._current]
+
+        # Construct a temporary name
+        name = "{}_{}".format(self.identifier, self._counter)
+        self._counter = self._counter + 1
+
+        # Construct the temporary and return it
+        temporary_variable(name,
+                           base_storage=base,
+                           storage_shape=(self.base_storage_size,),
+                           managed=True,
+                           **kwargs
+                           )
+
+        return name
+
+
+@generator_factory(item_tags=("kernel", "buffer"), cache_key_generator=lambda i, **kw: i)
+def initialize_buffer(identifier, base_storage_size=None, num=2):
+    if base_storage_size is None:
+        raise ValueError("The buffer for identifier {} has not been initialized.".format(identifier))
+    return FlipFlopBuffer(identifier,
+                          base_storage_size=base_storage_size,
+                          num=num,
+                          )
+
+
+def get_buffer_temporary(identifier, **kwargs):
+    return initialize_buffer(identifier).get_temporary(**kwargs)
+
+
+def switch_base_storage(identifier):
+    initialize_buffer(identifier).switch_base_storage()
diff --git a/python/dune/perftool/loopy/stages.py b/python/dune/perftool/loopy/stages.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8ff325a95aed044c043aecd722cef3b97aa4f4f
--- /dev/null
+++ b/python/dune/perftool/loopy/stages.py
@@ -0,0 +1,34 @@
+""" loopy instructions to mark stages of computations """
+
+from dune.perftool.generation import (generator_factory,
+                                      noop_instruction,
+                                      )
+
+from loopy import add_dependency
+from loopy.match import Id
+
+
+@generator_factory(item_tags=("stage",), cache_key_generator=lambda n, **kw: n)
+def stage_insn(n, **kwargs):
+    assert 'id' not in kwargs
+
+    # Get an ID for this instruction
+    id = 'stage_insn_{}'.format(n)
+
+    # Chain dependencies of stage instructions
+    if n > 0:
+        kwargs['depends_on'] = kwargs.get('depends_on', frozenset([])).union(frozenset([stage_insn(n - 1, **kwargs)]))
+
+    # Actually issue the instruction
+    noop_instruction(id=id, **kwargs)
+
+    return id
+
+
+def finalize_stage_instructions(kernel):
+    for i in range(len(stage_insn._memoize_cache)):
+        deps = frozenset({insn.id for insn in kernel.instructions if stage_insn(i) in insn.depends_on and not insn.id.startswith('stage_insn_')})
+        for dep_id in deps:
+            kernel = add_dependency(kernel, Id(stage_insn(i + 1)), dep_id)
+
+    return kernel
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index 36110ff2cfd53ee94155cadb73955fcdc54c30a1..b9767cdb92cb788f130533ea36e83d68588f202b 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -3,7 +3,7 @@ from loopy.target import (TargetBase,
                           DummyHostASTBuilder,
                           )
 from loopy.target.c import CASTBuilder
-from loopy.target.c.codegen.expression import ExpressionToCMapper
+from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper, CExpressionToCodeMapper
 
 _registry = {'float32': 'float',
              'int32': 'int',
@@ -12,20 +12,26 @@ _registry = {'float32': 'float',
              'str': 'std::string'}
 
 
-class MyMapper(ExpressionToCMapper):
-    def map_subscript(self, expr, enclosing_prec, type_context):
-        ret = str(expr.aggregate)
-        from pymbolic.primitives import Variable
-        if isinstance(expr.index, Variable):
-            ret = ret + '[{}]'.format(str(expr.index))
-        else:
+class MyMapper(ExpressionToCExpressionMapper):
+    def map_subscript(self, expr, type_context):
+        from dune.perftool.generation.loopy import UnmanagedTemporaryVariable
+        if isinstance(self.find_array(expr), UnmanagedTemporaryVariable):
+            # If there is but one index, we do not need to handle this
+            from pymbolic.primitives import Subscript, Variable
+            if isinstance(expr.index, (Variable, int)):
+                return expr
+
+            # Else, we construct a nested Subscript chain
+            ret = expr.aggregate
             for i in expr.index:
-                ret = ret + '[{}]'.format(str(i))
-        return ret
+                ret = Subscript(ret, i)
+            return ret
+        else:
+            return ExpressionToCExpressionMapper.map_subscript(self, expr, type_context)
 
 
 class DuneASTBuilder(CASTBuilder):
-    def get_expression_to_code_mapper(self, codegen_state):
+    def get_expression_to_c_expression_mapper(self, codegen_state):
         return MyMapper(codegen_state)
 
     def get_temporary_decls(self, codegen_state, schedule_index):
diff --git a/python/dune/perftool/pdelab/argument.py b/python/dune/perftool/pdelab/argument.py
index a55a485c28a8f358824efa6644588b314d091157..06d4a75dc02bfb21051eb16311ed2c7d03c0c55a 100644
--- a/python/dune/perftool/pdelab/argument.py
+++ b/python/dune/perftool/pdelab/argument.py
@@ -101,9 +101,9 @@ def name_trialfunction_gradient(element, restriction, component):
     # This is just a temporary test used to create an A-matrix as
     # local operator class member. Right now it doesn't evaluate
     # anything.
-    if get_option("sumfact"):
-        from dune.perftool.sumfact import evaluate_coefficient_gradient_sumfact
-        evaluate_coefficient_gradient_sumfact()
+    if get_option("sumfact") and restriction == Restriction.NONE:
+        from dune.perftool.sumfact import start_sumfactorization
+        start_sumfactorization(element, container, restriction, component)
 
     evaluate_coefficient_gradient(element, name, container, restriction, component)
     return name
diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py
index 303425bcd794f5d2c12d3d07c9dd4b784af6b31f..abb8938c6f0bb786aa0cef1f08c5ee880f05fffe 100644
--- a/python/dune/perftool/pdelab/driver.py
+++ b/python/dune/perftool/pdelab/driver.py
@@ -798,8 +798,8 @@ def name_solution_function(tree_path=()):
     try:
         name = get_global_context_value("data").object_names[id(expr)]
     except KeyError:
-        from dune.perftool.generation.cache import _GlobalCounter
-        name = 'generic_{}'.format(_GlobalCounter().get())
+        from dune.perftool.generation import get_counter
+        name = 'generic_{}'.format(get_counter('__generic'))
 
     define_boundary_function(expr, name)
 
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 702e8ff9a81c8af52f15fe0c38bded1a5b9086f3..3c008a173b98bcfcd06fb77f2948d4b6a76e2462 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -470,6 +470,7 @@ def generate_kernel(integrals):
     temporaries = {i.name: i for i in retrieve_cache_items("temporary")}
     arguments = [i for i in retrieve_cache_items("argument")]
     manglers = retrieve_cache_functions("mangler")
+    silenced = [l for l in retrieve_cache_items("silenced_warning")]
 
     # Construct an options object
     from loopy import Options
@@ -484,6 +485,7 @@ def generate_kernel(integrals):
                          function_manglers=manglers,
                          target=DuneTarget(),
                          options=opt,
+                         silenced_warnings=silenced,
                          )
 
     from loopy import make_reduction_inames_unique
@@ -494,6 +496,10 @@ def generate_kernel(integrals):
     from dune.perftool.loopy.duplicate import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
+    # Finalize our stages mechanism
+    from dune.perftool.loopy.stages import finalize_stage_instructions
+    kernel = finalize_stage_instructions(kernel)
+
     # This is also silly, but 1D DG Schemes never need the geometry, so the quadrature
     # statement actually introduces a preamble at a stage where preambles cannot be generated
     # anymore. TODO think about how to avoid this!!!
diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py
index daaea90a264472f83ea13bd9a58fc3fc356a72c9..793cbde027e0e7c6a9b8278adbfa66cd60d38156 100644
--- a/python/dune/perftool/sumfact/__init__.py
+++ b/python/dune/perftool/sumfact/__init__.py
@@ -1 +1 @@
-from dune.perftool.sumfact.amatrix import (evaluate_coefficient_gradient_sumfact)
+from dune.perftool.sumfact.sumfact import start_sumfactorization
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index 8755a56abe2b8a77efd7840c33b13337c5a102eb..9d2e249886c0075fe318449cfede12183ec24d2d 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -1,10 +1,20 @@
+from dune.perftool import Restriction
+
 from dune.perftool.options import get_option
 
+from dune.perftool.pdelab.argument import name_coefficientcontainer
+
 from dune.perftool.generation import (class_member,
                                       constructor_block,
+                                      domain,
+                                      function_mangler,
                                       get_global_context_value,
+                                      globalarg,
+                                      iname,
                                       include_file,
                                       initializer_list,
+                                      temporary_variable,
+                                      valuearg
                                       )
 
 from dune.perftool.pdelab.localoperator import (name_domain_field,
@@ -12,6 +22,41 @@ from dune.perftool.pdelab.localoperator import (name_domain_field,
                                                 )
 from dune.perftool.pdelab.quadrature import estimate_quadrature_order
 
+from loopy import CallMangleInfo
+from loopy.symbolic import FunctionIdentifier
+from loopy.types import NumpyType
+
+from pytools import Record
+
+import numpy
+
+
+class AMatrix(Record):
+    def __init__(self, a_matrix, rows, cols):
+        Record.__init__(self,
+                        a_matrix=a_matrix,
+                        rows=rows,
+                        cols=cols,
+                        )
+
+
+class ColMajorAccess(FunctionIdentifier):
+    def __init__(self, amatrix):
+        self.amatrix = amatrix
+
+    def __getinitargs__(self):
+        return (self.amatrix,)
+
+    @property
+    def name(self):
+        return '{}.colmajoraccess'.format(self.amatrix)
+
+
+@function_mangler
+def colmajoraccess_mangler(target, func, dtypes):
+    if isinstance(func, ColMajorAccess):
+        return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.int32)))
+
 
 @class_member("operator")
 def define_alignment(name):
@@ -106,10 +151,14 @@ def typedef_polynomials(name):
     include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="operatorfile")
 
     # TODO: make switching polynomials possible
-    return "using {} = Dune::QkStuff::GaussLobattoLagrangePolynomials<{}, {}, {}>;".format(name,
-                                                                                           domain_field,
-                                                                                           range_field,
-                                                                                           degree)
+    # return "using {} = Dune::QkStuff::GaussLobattoLagrangePolynomials<{}, {}, {}>;".format(name,
+    #                                                                                        domain_field,
+    #                                                                                        range_field,
+    #                                                                                        degree)
+    return "using {} = Dune::QkStuff::EquidistantLagrangePolynomials<{}, {}, {}>;".format(name,
+                                                                                          domain_field,
+                                                                                          range_field,
+                                                                                          degree)
 
 
 def type_polynomials():
@@ -142,17 +191,19 @@ def sort_quadrature_points_weights():
 
 
 @constructor_block("operator")
-def construct_theta(name):
+def construct_theta(name, transpose):
     # Make sure that the quadrature points are sorted
     sort_quadrature_points_weights()
 
-    m = name_number_of_quadrature_points_per_direction()
-    n = name_number_of_basis_functions_per_direction()
+    if transpose:
+        shape = (name_number_of_basis_functions_per_direction(), name_number_of_quadrature_points_per_direction())
+    else:
+        shape = (name_number_of_quadrature_points_per_direction(), name_number_of_basis_functions_per_direction())
     polynomials = name_polynomials()
     qp = name_oned_quadrature_points()
 
-    return ["for (std::size_t i=0; i<{}; i++){{".format(m),
-            "  for (std::size_t j=0; j<{}; j++){{".format(n),
+    return ["for (std::size_t i=0; i<{}; i++){{".format(shape[0]),
+            "  for (std::size_t j=0; j<{}; j++){{".format(shape[1]),
             "    {}.colmajoraccess(i,j) = {}.p(j,{}[i]);".format(name, polynomials, qp),
             "  }",
             "}"]
@@ -160,6 +211,7 @@ def construct_theta(name):
 
 @class_member("operator")
 def typedef_theta(name):
+    include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
     alignment = name_alignment()
     range_field = lop_template_range_field()
     return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment)
@@ -172,21 +224,25 @@ def type_theta():
 
 
 @class_member("operator")
-def define_theta(name):
+def define_theta(name, transpose):
     theta_type = type_theta()
-    number_qp = name_number_of_quadrature_points_per_direction()
-    number_basis = name_number_of_basis_functions_per_direction()
-    initializer_list(name, [str(number_qp), str(number_basis)], classtag="operator")
-    construct_theta(name)
+    if transpose:
+        shape = (basis_functions_per_direction(), quadrature_points_per_direction())
+    else:
+        shape = (quadrature_points_per_direction(), basis_functions_per_direction())
+    globalarg(name, shape=shape, dtype=numpy.float32, dim_tags="f,f")
+    initializer_list(name, [str(axis) for axis in shape], classtag="operator")
+    construct_theta(name, transpose)
     return "{} {};".format(theta_type, name)
 
 
 def name_theta():
     name = "Theta"
-    define_theta(name)
+    define_theta(name, False)
     return name
 
 
-def evaluate_coefficient_gradient_sumfact():
-    include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
-    name_theta()
+def name_theta_transposed():
+    name = "ThetaT"
+    define_theta(name, True)
+    return name
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7ee0a042c1fab3ac5b3662877c5a4b3a841f8a4
--- /dev/null
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -0,0 +1,150 @@
+from dune.perftool.pdelab.argument import pymbolic_coefficient
+from dune.perftool.generation import (domain,
+                                      get_counter,
+                                      iname,
+                                      instruction,
+                                      silenced_warning,
+                                      )
+from dune.perftool.loopy.buffer import (get_buffer_temporary,
+                                        initialize_buffer,
+                                        switch_base_storage,
+                                        )
+from dune.perftool.pdelab.spaces import name_lfs
+from dune.perftool.sumfact.amatrix import (AMatrix,
+                                           quadrature_points_per_direction,
+                                           basis_functions_per_direction,
+                                           name_theta,
+                                           name_theta_transposed,
+                                           )
+from dune.perftool.loopy.stages import stage_insn
+from pymbolic.primitives import (Call,
+                                 Product,
+                                 Subscript,
+                                 Variable,
+                                 )
+
+from loopy import Reduction
+
+from pytools import product
+
+
+@iname
+def _sumfact_iname(bound, _type, count):
+    name = "sf_{}_{}".format(_type, str(count))
+    domain(name, bound)
+    return name
+
+
+def sumfact_iname(bound, _type):
+    count = get_counter('_sumfac_iname_{}'.format(_type))
+    name = _sumfact_iname(bound, _type, count)
+    return name
+
+
+def setup_theta(element, container, restriction, component, a_matrices):
+    number_basis = product(mat.cols for mat in a_matrices)
+    shape = (number_basis,)
+    inp = get_buffer_temporary("buffer",
+                               shape=shape)
+    silenced_warning('read_no_write({})'.format(inp))
+
+    # Write initial coefficients into buffer
+    basisiname = sumfact_iname(number_basis, "basis")
+    lfs = name_lfs(element, restriction, component)
+    coeff = pymbolic_coefficient(container, lfs, basisiname)
+    assignee = Subscript(Variable(inp), (Variable(basisiname),))
+    return instruction(assignee=assignee,
+                       expression=coeff,
+                       depends_on=frozenset({stage_insn(0)}),
+                       )
+
+
+# TODO this code is WIP and mainly used for experiments.
+def start_sumfactorization(element, container, restriction, component):
+    theta = name_theta()
+    rows = quadrature_points_per_direction()
+    cols = basis_functions_per_direction()
+    a_matrix = AMatrix(theta, rows, cols)
+    a_matrices = (a_matrix, a_matrix)
+
+    # Do stage 1
+    initialize_buffer("buffer",
+                      base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
+                      num=2
+                      )
+
+    insn_dep = setup_theta(element, container, restriction, component, a_matrices)
+
+    sum_factorization_kernel(a_matrices, "buffer", 0, frozenset({insn_dep}))
+
+    # Do stage 3 (for f=u => mass matrix)
+    theta_transposed = name_theta_transposed()
+    a_matrix_transposed = AMatrix(theta_transposed, cols, rows)
+    a_matrices_transposed = (a_matrix_transposed, a_matrix_transposed)
+
+    return sum_factorization_kernel(a_matrices_transposed, "buffer", 2)
+
+
+def sum_factorization_kernel(a_matrices, buffer, stage, insn_dep=frozenset({})):
+    """
+    Calculate a sum factorization matrix product.
+
+    Y = A_{d-1}*...*A_0*X
+
+    where X is the input matrix and Y is the output variable.
+
+    Arguments:
+    ----------
+    a_matrices: An iterable of AMatrix instances
+        The list of tensors to be applied to the input.
+        Order of application is from 0 up.
+    buffer: A string identifying the flip flop buffer in use
+        for intermediate results. The memory is expected to be
+        pre-initialized with the input.
+    insn_dep: an instruction ID that the first issued instruction
+        should depend upon. All following ones will depend on each
+        other.
+    """
+    for l, a_matrix in enumerate(a_matrices):
+        # Get a temporary that interprets the base storage of the input
+        # as a column-major matrix. In later iteration of the amatrix loop
+        # this reinterprets the output of the previous iteration.
+        inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
+        inp = get_buffer_temporary(buffer,
+                                   shape=inp_shape,
+                                   dim_tags="f,f")
+
+        # The input temporary will only be read from, so we need to silence the loopy warning
+        silenced_warning('read_no_write({})'.format(inp))
+
+        switch_base_storage(buffer)
+
+        # Get a temporary that interprets the base storage of the output
+        # as row-major matrix
+        out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
+        out = get_buffer_temporary(buffer,
+                                   shape=out_shape,
+                                   dim_tags="c,c")
+
+        # Get the inames needed for one matrix-matrix multiplication
+        i = sumfact_iname(out_shape[0], "row")
+        j = sumfact_iname(out_shape[1], "col")
+        k = sumfact_iname(a_matrix.cols, "red")
+
+        # Construct the matrix-matrix-multiplication expression a_ik*in_kj
+        from dune.perftool.sumfact.amatrix import ColMajorAccess
+        prod = Product((Call(ColMajorAccess(a_matrix.a_matrix), (Variable(i), Variable(k))),
+                        Subscript(Variable(inp), (Variable(k), Variable(j)))
+                        ))
+
+        # Issue the reduction instruction that implements the multiplication
+        # at the same time store the instruction ID for the next instruction to depend on
+        insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j))),
+                                          expression=Reduction("sum", k, prod),
+                                          forced_iname_deps=frozenset({i, j}),
+                                          forced_iname_deps_is_final=True,
+                                          depends_on=insn_dep.union(frozenset({stage_insn(stage)})),
+                                          )
+                              })
+
+    return out
diff --git a/python/loopy b/python/loopy
index 2e562728b81caed706a1304fdffd64e86286bae4..a08c8677fbd3aaded4ad63210d24439aac18f2d0 160000
--- a/python/loopy
+++ b/python/loopy
@@ -1 +1 @@
-Subproject commit 2e562728b81caed706a1304fdffd64e86286bae4
+Subproject commit a08c8677fbd3aaded4ad63210d24439aac18f2d0
diff --git a/python/pymbolic b/python/pymbolic
index 5fa83688cb95b429d8fa4175bb2d88ad6778a311..22b3e10d489ca9c8ba21e2e8bba77791f3f54ade 160000
--- a/python/pymbolic
+++ b/python/pymbolic
@@ -1 +1 @@
-Subproject commit 5fa83688cb95b429d8fa4175bb2d88ad6778a311
+Subproject commit 22b3e10d489ca9c8ba21e2e8bba77791f3f54ade
diff --git a/python/test/dune/perftool/generation/test_backend.py b/python/test/dune/perftool/generation/test_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a1e001cd9da6dad0d0d18c46d2da47d9506da7c
--- /dev/null
+++ b/python/test/dune/perftool/generation/test_backend.py
@@ -0,0 +1,35 @@
+from dune.perftool.generation import (backend,
+                                      get_backend,
+                                      generator_factory,
+                                      )
+
+
+@backend(interface="foo", name="f1")
+@generator_factory()
+def f1():
+    return 1
+
+
+@generator_factory()
+@backend(interface="foo", name="f2")
+def f2():
+    return 2
+
+
+@backend(interface="bar", name="f3")
+@generator_factory()
+def f3():
+    return 3
+
+
+@generator_factory()
+@backend(interface="bar", name="f4")
+def f4():
+    return 4
+
+
+def test_backend():
+    assert get_backend(interface="foo", selector=lambda: "f1")() == 1
+    assert get_backend(interface="foo", selector=lambda: "f2")() == 2
+    assert get_backend(interface="bar", selector=lambda: "f3")() == 3
+    assert get_backend(interface="bar", selector=lambda: "f4")() == 4
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 4620b9a53c6a052155d08b7a42dbdf94c6e9aba5..6a7006ce4c84db13bd9376b56be9eb69770d466d 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -1,11 +1,11 @@
-# 1. Poisson Test Case: source f, bc g
-dune_add_formcompiler_system_test(UFLFILE poisson.ufl
-                                  BASENAME sumfact_poisson
-                                  INIFILE poisson.mini
-                                  )
-
-# # 2. Poisson Test Case: DG, f + pure dirichlet
-# dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
-#                                   BASENAME sumfact_poisson_dg
-#                                   INIFILE poisson_dg.mini
+# # 1. Poisson Test Case: source f, bc g
+# dune_add_formcompiler_system_test(UFLFILE poisson.ufl
+#                                   BASENAME sumfact_poisson
+#                                   INIFILE poisson.mini
 #                                   )
+
+# 2. Poisson Test Case: DG, f + pure dirichlet
+dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
+                                  BASENAME sumfact_poisson_dg
+                                  INIFILE poisson_dg.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson.mini b/test/sumfact/poisson/poisson.mini
index 5f07c985767d2ef9bfbe5e5bebcf82d801222833..491c685915a7585fc248ef11e6047bcb7de6cc84 100644
--- a/test/sumfact/poisson/poisson.mini
+++ b/test/sumfact/poisson/poisson.mini
@@ -1,10 +1,8 @@
 __name = sumfact_poisson_{__exec_suffix}
 __exec_suffix = numdiff, symdiff | expand num
 
-lowerleft = 0.0 0.0
-upperright = 1.0 1.0
-elements = 32 32
-elementType = simplical
+cells = 1 1
+extension = 1. 1.
 
 [wrapper.vtkcompare]
 name = {__name}
diff --git a/test/sumfact/poisson/poisson.ufl b/test/sumfact/poisson/poisson.ufl
index 8c62dd3a24d2f64a3aec350c2096fd65967498d8..576c603e5c5f8b8d00bad8ea4cdcb6ec19ebd332 100644
--- a/test/sumfact/poisson/poisson.ufl
+++ b/test/sumfact/poisson/poisson.ufl
@@ -1,7 +1,9 @@
-f = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*std::exp(-1.*c.two_norm2());")
-g = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return std::exp(-1.*c.two_norm2());")
+cell = "quadrilateral"
 
-V = FiniteElement("CG", "triangle", 1, dirichlet_expression=g)
+f = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*std::exp(-1.*c.two_norm2());", cell=cell)
+g = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return std::exp(-1.*c.two_norm2());", cell=cell)
+
+V = FiniteElement("CG", cell, 1, dirichlet_expression=g)
 u = TrialFunction(V)
 v = TestFunction(V)
 
diff --git a/test/sumfact/poisson/poisson_dg.mini b/test/sumfact/poisson/poisson_dg.mini
index 8a2000a43db23777bd2a16ef034a794460bd3b98..f388ba3eb61b3426b5edfe9955de1c69e32281e1 100644
--- a/test/sumfact/poisson/poisson_dg.mini
+++ b/test/sumfact/poisson/poisson_dg.mini
@@ -1,18 +1,13 @@
 __name = sumfact_poisson_dg_{__exec_suffix}
 __exec_suffix = numdiff, symdiff | expand num
 
-lowerleft = 0.0 0.0
-upperright = 1.0 1.0
-elements = 32 32
-elementType = simplical
+cells = 1 1
+extension = 1. 1.
 
 [wrapper.vtkcompare]
 name = {__name}
-reference = poisson_dg_ref
 extension = vtu
 
 [formcompiler]
 numerical_jacobian = 1, 0 | expand num
-exact_solution_expression = g
-compare_l2errorsquared = 2e-5
 sumfact = 1
diff --git a/test/sumfact/poisson/poisson_dg.ufl b/test/sumfact/poisson/poisson_dg.ufl
index 0ee5a857931d2ceea62b8cf5e52c5aed2b696851..a0b5049d9218da6984e040652f90c7fb24d809a8 100644
--- a/test/sumfact/poisson/poisson_dg.ufl
+++ b/test/sumfact/poisson/poisson_dg.ufl
@@ -1,7 +1,8 @@
-f = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*std::exp(-1.*c.two_norm2());")
-g = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return std::exp(-1.*c.two_norm2());", on_intersection=True)
+cell = "quadrilateral"
+
+f = Expression("return -2.0*x.size();", cell=cell)
+g = Expression("return x.two_norm2();", on_intersection=True, cell=cell)
 
-cell = triangle
 V = FiniteElement("DG", cell, 1)
 
 u = TrialFunction(V)
@@ -23,4 +24,4 @@ r = inner(grad(u), grad(v))*dx \
   + theta*g*inner(grad(v), n)*ds \
   - gamma*g*v*ds
 
-forms = [r]
+forms = [r]
\ No newline at end of file