diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index 5fecdc4c25c3b4dfc76aa194c3a9ca7a2d50bee9..3751e64a0f55b5206c72a4ff38e4bc0d9bde2a18 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -2,6 +2,7 @@ from __future__ import absolute_import
 from dune.perftool.generation.cache import (cached,
+                                            no_caching,
@@ -17,10 +18,12 @@ from dune.perftool.generation.cpp import (base_class,
-from dune.perftool.generation.loopy import (domain,
+from dune.perftool.generation.loopy import (constantarg,
+                                            domain,
+                                            function_mangler,
diff --git a/python/dune/perftool/generation/cache.py b/python/dune/perftool/generation/cache.py
index 27f93b19e816146d983e99a535282f815fd58c89..80849f8cf4e504851ffc142745a544ae36bd0efa 100644
--- a/python/dune/perftool/generation/cache.py
+++ b/python/dune/perftool/generation/cache.py
@@ -54,8 +54,8 @@ class _NoCachingCounter(object):
         return _NoCachingCounter.counter
-def no_caching(*a):
-    return _NoCachingCounter.get()
+def no_caching(*a, **k):
+    """Cache key generator that ignores all its arguments and returns _NoCachingCounter().get()."""
+    # NOTE(review): presumably get() yields a new value on every call, so that no two
+    # calls ever share a cache entry - confirm against _NoCachingCounter.
+    return _NoCachingCounter().get()
 class _CacheItemMeta(type):
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index b99ea877dbfb679f24844a8f8dc7fc9f8d1cd778..61e04f062620a488c4b95cbbe1b5d19bdea07f1b 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -2,6 +2,7 @@
 from __future__ import absolute_import
 from dune.perftool.generation import (generator_factory,
+                                      no_caching,
@@ -11,7 +12,7 @@ import numpy
 iname = generator_factory(item_tags=("iname",))
 valuearg = generator_factory(item_tags=("argument", "valuearg"), on_store=lambda n: loopy.ValueArg(n), no_deco=True)
 pymbolic_expr = generator_factory(item_tags=("kernel", "pymbolic"))
-constantarg = generator_factory(item_tags=("kernel", "argument", "constantarg"), on_store=lambda n: loopy.ConstantArg(n))
+function_mangler = generator_factory(item_tags=("kernel", "mangler"))
 @generator_factory(item_tags=("argument", "globalarg"),
@@ -19,7 +20,17 @@ constantarg = generator_factory(item_tags=("kernel", "argument", "constantarg"),
 def globalarg(name, shape=loopy.auto, **kw):
     if isinstance(shape, str):
         shape = (shape,)
-    return loopy.GlobalArg(name, dtype=numpy.float64, shape=shape, **kw)
+    dtype = kw.pop("dtype", numpy.float64)
+    return loopy.GlobalArg(name, dtype=dtype, shape=shape, **kw)
+@generator_factory(item_tags=("argument", "constantarg"),
+                   cache_key_generator=lambda n, **kw: n)
+def constantarg(name, shape=loopy.auto, **kw):
+    """Create the loopy argument object for `name`; the cache key is the name alone.
+    A plain string `shape` is wrapped into a 1-tuple, `dtype` defaults to numpy.float64
+    and all remaining keyword arguments are forwarded unchanged.
+    """
+    if isinstance(shape, str):
+        shape = (shape,)
+    dtype = kw.pop("dtype", numpy.float64)
+    # NOTE(review): this builds a loopy.GlobalArg although the generator is named
+    # constantarg - confirm that a loopy.ConstantArg is not what is wanted here.
+    return loopy.GlobalArg(name, dtype=dtype, shape=shape, **kw)
@@ -41,11 +52,11 @@ def _temporary_type(shape_impl, shape, first=True):
     if shape_impl[0] == 'fv':
         return "Dune::FieldVector<{}, {}>".format(_temporary_type(shape_impl[1:], shape[1:], first=False), shape[0])
     if shape_impl[0] == 'fm':
-        pass
+        raise NotImplementedError
-def default_declaration(name, shape, shape_impl):
+def default_declaration(name, shape=(), shape_impl=()):
     # Determine the C++ type to use for this temporary.
     t = _temporary_type(shape_impl, shape)
     if len(shape_impl) == 0:
@@ -59,8 +70,15 @@ def default_declaration(name, shape, shape_impl):
         return '{} {}(0.0);'.format(t, name)
-@generator_factory(item_tags=("temporary",), cache_key_generator=lambda n, **kw: n)
-def temporary_variable(name, **kwargs):
+class _TemporaryCounter:
+    # Running number that temporary_variable uses to build default names of the form expr_0000
+    counter = 0
+@generator_factory(item_tags=("temporary",), cache_key_generator=no_caching)
+def temporary_variable(name=None, **kwargs):
+    if name is None:
+        name = 'expr_{}'.format(str(_TemporaryCounter.counter).zfill(4))
+        _TemporaryCounter.counter = _TemporaryCounter.counter + 1
     if 'dtype' not in kwargs:
         kwargs['dtype'] = numpy.float64
@@ -101,17 +119,17 @@ class _IDCounter:
 def _insn_cache_key(code=None, expression=None, **kwargs):
+    """Cache key for instruction(): `code` if it is given, otherwise `expression`.
+    Raises ValueError if both are None. All other keyword arguments are ignored.
+    """
-    if code:
+    if code is not None:
         return code
-    if expression:
+    if expression is not None:
         return expression
     raise ValueError("Please specify either code or expression for instruction!")
 @generator_factory(item_tags=("insn_id",), cache_key_generator=_insn_cache_key)
 def instruction(code=None, expression=None, **kwargs):
-    assert code or expression
-    assert not (code and expression)
+    assert (code is not None) or (expression is not None)
+    assert not ((code is not None) and (expression is not None))
     assert 'id' not in kwargs
     # Get an ID for this instruction
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index dd4441ddc5e54389640723d6dd92ab060844a404..71bb18d067122592ec867ddb56bd32ee3df4ef08 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -5,6 +5,11 @@ from loopy.target import TargetBase
 from loopy.target.c.codegen.expression import LoopyCCodeMapper
+from loopy.library.reduction import (ReductionOperation,
+                                     register_reduction_parser,
+                                     )
 class AllToDouble(dict):
     """ This imitates a dict that maps everything to double and logs the requested keys """
     def __getitem__(self, key):
@@ -28,6 +33,13 @@ class MyMapper(LoopyCCodeMapper):
                 ret = ret + '[{}]'.format(str(i))
         return ret
+    def map_variable(self, expr, enclosing_prec, type_context):
+        """Emit a VerbatimVariable as its bare name; defer every other variable to the parent mapper."""
+        from dune.perftool.pymbolic import VerbatimVariable
+        if not isinstance(expr, VerbatimVariable):
+            return super(MyMapper, self).map_variable(expr, enclosing_prec, type_context)
+        # Verbatim variables bypass the usual variable handling entirely
+        return expr.name
 class DuneTarget(TargetBase):
@@ -37,12 +49,9 @@ class DuneTarget(TargetBase):
     def dtype_to_typename(self, dtype):
         # For now, we do this the simplest possible way
         return _registry[dtype.dtype.name]
-#     def is_vector_dtype(self, dtype):
-#         raise NotImplementedError()
-#     def vector_dtype(self, base, count):
-#         raise NotImplementedError()
+    def is_vector_dtype(self, dtype):
+        """Answer False for every `dtype`: this target reports no dtype as a vector type."""
+        return False
     def get_expression_to_code_mapper(self, codegen_state):
         return MyMapper(codegen_state)
diff --git a/python/dune/perftool/loopy/transformer.py b/python/dune/perftool/loopy/transformer.py
index 0120b33f905fd7870d04dcd92cd7c2f790041152..7e414082efd7b23418e6f38dcc5e59888b325459 100644
--- a/python/dune/perftool/loopy/transformer.py
+++ b/python/dune/perftool/loopy/transformer.py
@@ -24,6 +24,8 @@ from dune.perftool.pdelab.basis import (lfs_iname,
+from dune.perftool.pdelab.quadrature import quadrature_iname
+from pymbolic.primitives import Subscript, Variable
@@ -33,141 +35,132 @@ def index_sum_iname(i):
 class UFL2LoopyVisitor(ModifiedTerminalTracker, UFL2PymbolicMapper, GeometryMapper):
-    def __init__(self):
+    def __init__(self, measure):
+        self.iname_stack = [quadrature_iname()]
+        self.measure = measure
         super(UFL2LoopyVisitor, self).__init__()
-    def coefficient(self, o):
-        # All trial functions should already be handled
-        assert o.count() != 0
+    def _assign(self, o):
+        """Issue an instruction assigning the pymbolic expression `o` to a temporary.
+        The instruction is forced to depend on exactly the inames currently on
+        self.iname_stack. Returns a Variable naming the temporary that holds the result.
+        """
+        # Assign a name to the temporary variable we want our result in
+        temp = temporary_variable().name
-        # We expect all coefficients to be of type Expression!
-        from dune.perftool.ufl.execution import Expression
-        assert isinstance(o, Expression)
+        # Now we assign this expression to a new temporary variable
+        insn_id = instruction(assignee=Variable(temp),
+                              expression=o,
+                              forced_iname_deps=frozenset(self.iname_stack),
+                              forced_iname_deps_is_final=True,
+                              )
-        # Determine the name of the parameter function
-        from dune.perftool.generation import get_global_context_value
-        name = get_global_context_value("namedata")[id(o)]
+        # Actually, if we have a cache hit, we need to change our temporary.
+        # next() over a generator works on Python 2 and 3; subscripting filter() only works on Python 2.
+        from dune.perftool.generation import retrieve_cache_items
+        temp = next(i for i in retrieve_cache_items("instruction") if i.id == insn_id).assignee_name
-        # Trigger the generation of code for this thing in the parameter class
-        from dune.perftool.pdelab.parameter import (cell_parameter_function,
-                                                    intersection_parameter_function,
-                                                    )
-        if o.on_intersection:
-            intersection_parameter_function(name, o)
+        return Variable(temp)
+    def __call__(self, o):
+        # First we do the tree traversal to get a pymbolic expression representing this expression
+        ret = self.call(o)
+        if isinstance(ret, Variable):
+            return ret
-            cell_parameter_function(name, o, self.restriction)
+            return self._assign(ret)
+    def argument(self, o):
+        # Have the issued instruction depend on the iname for this localfunction space
+        from dune.perftool.pdelab.basis import lfs_iname
+        self.iname_stack.append(lfs_iname(o.element(), argnumber=o.number()))
-        # And return a symbol
-        from pymbolic.primitives import Variable
-        return Variable(name)
+        # Correct the restriction on boundary integrals
+        restriction = self.restriction
+        if self.measure == 'exterior_facet':
+            restriction = Restriction.NEGATIVE
+        if self.grad:
+            from dune.perftool.pdelab.argument import name_testfunction_gradient
+            return Subscript(Variable(name_testfunction_gradient(o, restriction)), (Variable(lfs_iname(o.element(), argnumber=o.number())),))
+        else:
+            from dune.perftool.pdelab.argument import name_testfunction
+            return Subscript(Variable(name_testfunction(o, restriction)), (Variable(lfs_iname(o.element(), argnumber=o.number())),))
+    def coefficient(self, o):
+        """Translate a UFL coefficient into a pymbolic Variable.
+        A coefficient with count 0 is treated as the trial function, every other
+        coefficient as a parameter function that must be of type Expression.
+        """
+        if o.count() != 0:
+            # This is a parameter function. We expect all of them to be of type Expression!
+            from dune.perftool.ufl.execution import Expression
+            assert isinstance(o, Expression)
+            # Look up the name of the parameter function
+            from dune.perftool.generation import get_global_context_value
+            name = get_global_context_value("namedata")[id(o)]
+            # Trigger the generation of code for this thing in the parameter class
+            from dune.perftool.pdelab.parameter import (cell_parameter_function,
+                                                        intersection_parameter_function,
+                                                        )
+            if not o.on_intersection:
+                cell_parameter_function(name, o, self.restriction)
+            else:
+                intersection_parameter_function(name, o)
+            # And return a symbol
+            return Variable(name)
+        # From here on we are dealing with the trial function.
+        # On boundary integrals the restriction is replaced by Restriction.NEGATIVE
+        restriction = self.restriction
+        if self.measure == 'exterior_facet':
+            restriction = Restriction.NEGATIVE
+        if not self.grad:
+            from dune.perftool.pdelab.argument import name_trialfunction
+            return Variable(name_trialfunction(o, restriction))
+        from dune.perftool.pdelab.argument import name_trialfunction_gradient
+        return Variable(name_trialfunction_gradient(o, restriction))
     def index_sum(self, o):
+        from loopy import Reduction
         from dune.perftool.ufl.shape import determine_shape
+        inames = ()
         # Define an iname for each of the indices in the multiindex
         for i in o.ufl_operands[1].indices():
+            inames = inames + (index_sum_iname(i),)
             shape = determine_shape(o.ufl_operands[0], i)
-            index_sum_iname(i)
             from dune.perftool.pdelab import name_index
             domain(name_index(i), shape)
-        # Now continue processing the expression
-        return self.call(o.ufl_operands[0])
+        # Recurse to get the summation expression
+        term = self.call(o.ufl_operands[0])
+        ret = self._assign(Reduction("sum", inames, term))
-class _Counter:
-    counter = 0
-def get_count():
-    c = _Counter.counter
-    _Counter.counter = c + 1
-    return c
+        return ret
 def transform_accumulation_term(term, measure, subdomain_id):
-    from dune.perftool.ufl.transformations.replace import ReplaceExpression
-    from pymbolic.primitives import Variable
-    # We always have a quadrature loop
-    from dune.perftool.pdelab.quadrature import quadrature_iname
-    quadrature_iname()
-    # Get the pymbolic expression needed for this accumulation term.
-    # This includes filling the cache with all sorts of necessary preambles!
+    # Initialize the local function spaces that we might need for this term
+    # We therefore gather a list of modified trial functions too.
     from dune.perftool.ufl.modified_terminals import extract_modified_arguments
-    test_ma = extract_modified_arguments(term, trialfunction=False, testfunction=True)
+    test_ma = extract_modified_arguments(term)
     trial_ma = extract_modified_arguments(term, trialfunction=True, testfunction=False)
-    rmap = {}
-    for ma in test_ma:
-        # If this is a boundary integral, all terms are implicitly restricted to the inside cell
-        if measure == 'exterior_facet':
-            ma.restriction = Restriction.NEGATIVE
-        # Set up the local function space structure
-        traverse_lfs_tree(ma)
-        # Get the expression for the modified argument representing the test function
-        from dune.perftool.pdelab.argument import pymbolic_testfunction
-        rmap[ma.expr] = pymbolic_testfunction(ma)
-    for ma in trial_ma:
-        # If this is a boundary integral, all terms are implicitly restricted to the inside cell
+    # All test and trial functions on boundary integrals are technically restricted
+    import itertools
+    for ma in itertools.chain(test_ma, trial_ma):
         if measure == 'exterior_facet':
             ma.restriction = Restriction.NEGATIVE
-        # Set up the local function space structure
-        # Get the expression for the modified argument representing the trial function
-        from dune.perftool.pdelab.argument import pymbolic_trialfunction
-        rmap[ma.expr] = pymbolic_trialfunction(ma)
-    # Get the transformer!
-    ufl2l_mf = UFL2LoopyVisitor()
-    re_mf = ReplaceExpression(replacemap=rmap, otherwise=ufl2l_mf)
-    ufl2l_mf.call = re_mf.__call__
-    pymbolic_expr = re_mf(term)
-    # Now simplify the expression
-    # TODO: Add a switch to disable/configure this.
-    from dune.perftool.pymbolic.simplify import simplify_pymbolic_expression
-    pymbolic_expr = simplify_pymbolic_expression(pymbolic_expr)
-    # There are some corner cases in DG, where this accumulation term vanishes completely
-    if pymbolic_expr == 0:
-        return
-    # Define a temporary variable for this expression
-    expr_tv_name = "expr_" + str(get_count()).zfill(4)
-    from pymbolic.primitives import Variable
-    # This is a bit hacky now. To correctly determine the iname dependencies of
-    # the accumulation term, we inspect it manually. This is necessary, as loopys
-    # automatic detection would also considers those inames dependencies which are
-    # duplicates of the lfs inames.
-    from dune.perftool.pymbolic.inameset import get_index_inames
-    acc_inames = get_index_inames(pymbolic_expr).union(frozenset({quadrature_iname()}))
-    insn_id = instruction(assignee=Variable(expr_tv_name),
-                          expression=pymbolic_expr,
-                          forced_iname_deps=acc_inames,
-                          forced_iname_deps_is_final=True,
-                          )
-    # Actually, if we have a cache hit, we need to change our temporary
-    from dune.perftool.generation import retrieve_cache_items
-    expr_tv_name = filter(lambda i: i.id == insn_id, retrieve_cache_items("instruction"))[0].assignee_name
-    # Now register the temporary variable with loopy
-    expr_tv = temporary_variable(expr_tv_name)
+    pymbolic_expr = UFL2LoopyVisitor(measure)(term)
     # The data that is used to collect the arguments for the accumulate function
     accumargs = [None] * (2 * len(test_ma))
     residual_shape = [None] * len(test_ma)
     arg_restr = [None] * len(test_ma)
+    accum_inames = [None] * len(test_ma)
     for ma in test_ma:
         count = ma.argexpr.number()
@@ -177,6 +170,7 @@ def transform_accumulation_term(term, measure, subdomain_id):
         accumargs[2 * count] = lfs
         accumargs[2 * count + 1] = lfsi
+        accum_inames[count] = lfsi
         arg_restr[count] = ma.restriction
         residual_shape[count] = name_lfs_bound(lfs)
@@ -193,7 +187,7 @@ def transform_accumulation_term(term, measure, subdomain_id):
     # Generate the code snippet for this accumulation instruction
     code = "{}.accumulate({}, {}*{});".format(accumvar,
                                               ", ".join(accumargs),
-                                              expr_tv_name,
+                                              pymbolic_expr.name,
@@ -232,7 +226,7 @@ def transform_accumulation_term(term, measure, subdomain_id):
     # Finally, issue the instruction
-                read_variables=frozenset({accumvar, factor, expr_tv_name}),
-                forced_iname_deps=acc_inames,
+                read_variables=frozenset({accumvar, factor, pymbolic_expr.name}),
+                forced_iname_deps=frozenset(accum_inames).union(frozenset({quadrature_iname()})),
diff --git a/python/dune/perftool/pdelab/argument.py b/python/dune/perftool/pdelab/argument.py
index 1c2e69bfd3d31717fb363e744ef80c66a8f8df90..bcb9ec3717432f0264b8554c34d7350a9d4db500 100644
--- a/python/dune/perftool/pdelab/argument.py
+++ b/python/dune/perftool/pdelab/argument.py
@@ -1,6 +1,6 @@
 """ Generator functions related to trial and test functions and the accumulation loop"""
-from dune.perftool.generation import domain, iname, pymbolic_expr, symbol, globalarg
+from dune.perftool.generation import domain, iname, pymbolic_expr, symbol, globalarg, function_mangler, constantarg, get_global_context_value
 from dune.perftool.ufl.modified_terminals import ModifiedArgumentDescriptor
 from dune.perftool.pdelab import (name_index,
@@ -13,63 +13,36 @@ from dune.perftool.pdelab.basis import (evaluate_trialfunction,
 from dune.perftool import Restriction
-from pymbolic.primitives import Subscript, Variable
+from pymbolic.primitives import Call, Subscript, Variable
-def name_testfunction_gradient(ma):
-    assert ma.grad
-    return name_basis_gradient(ma.argexpr.element(), ma.restriction)
+def name_testfunction_gradient(expr, restriction):
+    """Return name_basis_gradient for the element of `expr` under the given restriction."""
+    return name_basis_gradient(expr.element(), restriction)
-def name_testfunction(ma):
-    assert not ma.grad
-    return name_basis(ma.argexpr.element(), ma.restriction)
-def pymbolic_testfunction(ma):
-    # we only accept an index if we treat a gradient...
-    assert bool(ma.index) == ma.grad
-    if ma.grad:
-        return Subscript(Variable(name_testfunction_gradient(ma)), (Variable(lfs_iname(ma.argexpr.element(), argnumber=ma.argexpr.number())), Variable(name_index(ma.index))))
-    else:
-        return Subscript(Variable(name_testfunction(ma)), Variable(lfs_iname(ma.argexpr.element(), argnumber=ma.argexpr.number())))
+def name_testfunction(expr, restriction):
+    """Return name_basis for the element of `expr`.
+    If the global context value "integral_type" is 'exterior_facet', the given
+    restriction is overridden with Restriction.NEGATIVE.
+    """
+    it = get_global_context_value("integral_type")
+    if it == 'exterior_facet':
+        restriction = Restriction.NEGATIVE
+    return name_basis(expr.element(), restriction)
-def name_trialfunction_gradient(ma):
-    assert ma.grad
-    name = restricted_name("gradu", ma.restriction)
-    evaluate_trialfunction_gradient(ma.argexpr.element(), name, ma.restriction)
+def name_trialfunction_gradient(expr, restriction):
+    """Return the restricted name built from "gradu" after calling evaluate_trialfunction_gradient for it."""
+    name = restricted_name("gradu", restriction)
+    # Called for its side effect only; the return value is not used
+    evaluate_trialfunction_gradient(expr.element(), name, restriction)
     return name
-def name_trialfunction(ma):
-    assert not ma.grad
-    name = restricted_name("u", ma.restriction)
-    evaluate_trialfunction(ma.argexpr.element(), name, ma.restriction)
+def name_trialfunction(expr, restriction):
+    """Return the restricted name built from "u" after calling evaluate_trialfunction for it."""
+    name = restricted_name("u", restriction)
+    # Called for its side effect only; the return value is not used
+    evaluate_trialfunction(expr.element(), name, restriction)
     return name
-def pymbolic_trialfunction(ma):
-    # we only accept an index if we treat a gradient...
-    assert bool(ma.index) == ma.grad
-    if ma.grad:
-        return Subscript(Variable(name_trialfunction_gradient(ma)), Variable(name_index(ma.index)))
-    else:
-        return Variable(name_trialfunction(ma))
 def name_testfunctionspace(restriction):
     return restricted_name("lfsv", restriction)
@@ -92,7 +65,29 @@ def type_trialfunctionspace():
 def name_coefficientcontainer(restriction):
+    """Return the restricted name built from "x" for the given restriction."""
-    return restricted_name("x", restriction)
+    name = restricted_name("x", restriction)
+    # NOTE(review): neither of the two imported names is used in this function - confirm the import is needed.
+    from dune.perftool.pdelab.basis import name_lfs_bound, lfs_iname
+    return name
+def create_function_mangler(container):
+    """Return a mangler callable for the function named `container`.
+    The returned callable takes (kernel, name, dtypes) and yields the pair
+    (loopy.types.NumpyType("int"), container) if `name` equals `container`;
+    for every other name it implicitly returns None.
+    """
+    def _mangler(kernel, name, dtypes):
+        if name == container:
+            import loopy
+            return loopy.types.NumpyType("int"), container
+    return _mangler
+def pymbolic_coefficient(lfs, index, restriction):
+    """Build the pymbolic Call `container(lfs, index)` on the coefficient container of `restriction`.
+    The container name and `lfs` are wrapped as VerbatimVariable, `index` as a plain Variable.
+    """
+    container = name_coefficientcontainer(restriction)
+    # NOTE(review): the mangler returned here is discarded - it is neither stored nor
+    # handed to function_mangler. Confirm that it actually gets registered.
+    create_function_mangler(container)
+    import loopy
+    # Called for its side effect only; the created argument object is not used here
+    constantarg(lfs, dtype=loopy.types.NumpyType("str"))
+    from dune.perftool.pymbolic import VerbatimVariable
+    return Call(VerbatimVariable(container), (VerbatimVariable(lfs), Variable(index),))
@@ -123,14 +118,6 @@ def name_argument(ma):
     assert False
-def pymbolic_argument(ma):
-    if ma.argexpr.number() == 0:
-        return pymbolic_testfunction(ma)
-    if ma.argexpr.number() == 1:
-        return pymbolic_trialfunction(ma)
-    assert False
 def name_jacobian(restriction1, restriction2):
     # Restrictions may only differ if NONE
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index 745ce9b2bda64e37957884ca1f8998f5fb3779e5..5bf7fe5ba9f9083a978f0e729b6c24a10f0fc406 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -14,7 +14,8 @@ from dune.perftool.generation import (cached,
 from dune.perftool.pdelab.quadrature import (name_quadrature_position_in_cell,
-from dune.perftool.pdelab.geometry import (name_dimension,
+from dune.perftool.pdelab.geometry import (dimension_iname,
+                                           name_dimension,
@@ -23,6 +24,8 @@ from dune.perftool.pdelab.localoperator import (lop_template_ansatz_gfs,
 from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab import restricted_name
+from pymbolic.primitives import Product, Subscript, Variable
+from loopy import Reduction
@@ -135,7 +138,7 @@ def _lfs_iname(element, context):
     if context:
         context = '_' + context
-    name = name + context + '_index'
+    name = name + context + 'index'
     domain(name, bound)
     return name
@@ -157,6 +160,8 @@ def lfs_iname(element, context='', argnumber=None):
     if argnumber is not None:
         context = 'arg{}_{}'.format(argnumber, context)
+    else:
+        context = context + '_'
     return _lfs_iname(element, context)
@@ -249,70 +254,34 @@ def name_basis_gradient(element, restriction):
     return name
-def reset_trialfunction(name):
-    return instruction(inames=(quadrature_iname(),
-                               ),
-                       code="{} = 0.0;".format(name),
-                       assignees=frozenset({name}),
-                       )
 def evaluate_trialfunction(element, name, restriction):
+    """Issue the instruction that evaluates the trial function into the temporary `name`.
+    The value is a "sum" reduction over the lfs iname of coefficient times basis function;
+    the instruction is forced to depend on the quadrature iname only.
+    """
     temporary_variable(name, shape=())
-    reset = reset_trialfunction(name)
     lfs = name_lfs(element)
     index = lfs_iname(element, context='trial')
     basis = name_basis(element, restriction)
-    from dune.perftool.pdelab.argument import name_coefficientcontainer
-    coeffs = name_coefficientcontainer(restriction)
-    instruction(inames=(quadrature_iname(),
-                        index,
-                        ),
-                code='{} += {}({}, {})*{}[{}];'.format(name,
-                                                       coeffs,
-                                                       lfs,
-                                                       index,
-                                                       basis,
-                                                       index
-                                                       ),
-                assignees=frozenset({name}),
-                read_variables=frozenset({basis}),
-                depends_on=frozenset({reset}),
+    from dune.perftool.pdelab.argument import pymbolic_coefficient
+    coeff = pymbolic_coefficient(lfs, index, restriction)
+    reduction_expr = Product((coeff, Subscript(Variable(basis), Variable(index))))
+    # The assignee of an expression instruction is a pymbolic expression, not a set of names
+    instruction(expression=Reduction("sum", index, reduction_expr),
+                assignee=Variable(name),
+                forced_iname_deps=frozenset({quadrature_iname()}),
+                forced_iname_deps_is_final=True,
-def reset_trialfunction_gradient(name):
-    return instruction(inames=(quadrature_iname(),
-                               ),
-                       code='{} = 0.0;'.format(name),
-                       assignees=frozenset({name}),
-                       )
 def evaluate_trialfunction_gradient(element, name, restriction):
-    # TODO this is of course not yet correct
     temporary_variable(name, shape=(name_dimension(),), shape_impl=('fv',))
-    reset = reset_trialfunction_gradient(name)
     lfs = name_lfs(element)
     index = lfs_iname(element, context='trialgrad')
     basis = name_basis_gradient(element, restriction)
-    from dune.perftool.pdelab.argument import name_coefficientcontainer
-    coeffs = name_coefficientcontainer(restriction)
-    instruction(inames=(quadrature_iname(),
-                        index,
-                        ),
-                code='{}.axpy({}({}, {}), {}[{}]);'.format(name,
-                                                           coeffs,
-                                                           lfs,
-                                                           index,
-                                                           basis,
-                                                           index,
-                                                           ),
-                assignees=frozenset({name}),
-                read_variables=frozenset({basis}),
-                forced_iname_deps=frozenset({quadrature_iname(), index}),
+    idim = dimension_iname()
+    from dune.perftool.pdelab.argument import pymbolic_coefficient
+    coeff = pymbolic_coefficient(lfs, index, restriction)
+    reduction_expr = Product((coeff, Subscript(Variable(basis), (Variable(index), Variable(idim)))))
+    instruction(expression=Reduction("sum", index, reduction_expr),
+                assignee=Subscript(Variable(name), Variable(idim)),
+                forced_iname_deps=frozenset({quadrature_iname(), idim}),
-                depends_on=frozenset({reset}),
diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py
index bdf8bc57bfa199187e93dc2bfa89b9230f67d370..397a1614196d3d349f466783a90e0829eaea27f8 100644
--- a/python/dune/perftool/pdelab/driver.py
+++ b/python/dune/perftool/pdelab/driver.py
@@ -801,6 +801,6 @@ def vtkoutput():
     vec = name_vector()
     vtkfile = name_vtkfile()
-    # print_matrix()
+    print_matrix()
     return ["Dune::PDELab::addSolutionToVTKWriter({}, {}, {});".format(vtkwriter, gfs, vec),
             "{}.write({}, Dune::VTK::ascii);".format(vtkwriter, vtkfile)]
diff --git a/python/dune/perftool/pdelab/geometry.py b/python/dune/perftool/pdelab/geometry.py
index 8aac1c1d7bc1613d69be39e47779d6d819b2a3ec..1817b37b5538bb08de90476ed2f4c1138213aeb6 100644
--- a/python/dune/perftool/pdelab/geometry.py
+++ b/python/dune/perftool/pdelab/geometry.py
@@ -1,6 +1,9 @@
 from dune.perftool import Restriction
 from dune.perftool.pdelab import restricted_name
-from dune.perftool.generation import (preamble,
+from dune.perftool.generation import (domain,
+                                      get_global_context_value,
+                                      iname,
+                                      preamble,
@@ -41,6 +44,21 @@ class GeometryMapper(MultiFunction):
         return Variable(name_facetarea())
+def _dimension_iname(context, count):
+    """Declare a domain bounded by the geometric dimension of the form and return its name.
+    The name is 'idim', followed by '_' + context if a context is given, followed by count.
+    """
+    if context:
+        context = '_' + context
+    name = 'idim{}{}'.format(context, str(count))
+    # The dimension is read from the form data stored in the global context
+    formdata = get_global_context_value('formdata')
+    dim = formdata.geometric_dimension
+    domain(name, dim)
+    return name
+def dimension_iname(context='', count=0):
+    """Forward to _dimension_iname, defaulting to an empty context and count 0."""
+    return _dimension_iname(context, count)
 def name_element_geometry_wrapper():
     return 'eg'
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index d44424bb7e13fc4b1d5acf0ec1bc243e6c361148..bb38056a8ff67a0599da008e0c7c6be4fdad27be 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -176,15 +176,30 @@ def generate_kernel(integral):
     domains = [i for i in retrieve_cache_items("domain")]
     instructions = [i for i in retrieve_cache_items("instruction")]
     temporaries = {i.name: i for i in retrieve_cache_items("temporary")}
-    preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("preamble"))]
     arguments = [i for i in retrieve_cache_items("argument")]
+    manglers = [i for i in retrieve_cache_items("mangler")]
     # Create the kernel
     from loopy import make_kernel, preprocess_kernel
-    # kernel = make_kernel(domains, instructions, arguments, temporary_variables=temporaries, preambles=preambles, target=DuneTarget())
-    kernel = make_kernel(domains, instructions, arguments, temporary_variables=temporaries, target=DuneTarget(), preambles=preambles)
+    kernel = make_kernel(domains,
+                         instructions,
+                         arguments,
+                         temporary_variables=temporaries,
+                         function_manglers=manglers,
+                         target=DuneTarget()
+                         )
     kernel = preprocess_kernel(kernel)
+    # Loopy might have introduced some temporary variables during preprocessing. As I want to have my own
+    # temporary declaration code right now, I call the declaration preamble manually.
+    for added_tv in set(kernel.temporary_variables.keys()) - set(temporaries.keys()):
+        from dune.perftool.generation.loopy import default_declaration
+        default_declaration(added_tv)
+    # Now add the preambles to the kernel
+    preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("preamble"))]
+    kernel = kernel.copy(preambles=preambles)
     # All items with the kernel tags can be destroyed once a kernel has been generated
     from dune.perftool.generation import delete_cache_items
     delete_cache_items("(not file) and (not clazz)")
diff --git a/python/dune/perftool/pymbolic/__init__.py b/python/dune/perftool/pymbolic/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d4a85c13e666669948ac257142656a685dc8b24b 100644
--- a/python/dune/perftool/pymbolic/__init__.py
+++ b/python/dune/perftool/pymbolic/__init__.py
@@ -0,0 +1,5 @@
+from pymbolic.primitives import Variable
+class VerbatimVariable(Variable):
+    """Variable subclass that adds no behaviour of its own; it only serves as a distinct type."""
+    pass
diff --git a/python/dune/perftool/ufl/modified_terminals.py b/python/dune/perftool/ufl/modified_terminals.py
index b992dc05f2fe76fb68c76b336bd46cf580c96277..6beed959b0c0cd372114be603a8eeaae2190f122 100644
--- a/python/dune/perftool/ufl/modified_terminals.py
+++ b/python/dune/perftool/ufl/modified_terminals.py
@@ -18,22 +18,14 @@ class ModifiedTerminalTracker(MultiFunction):
     def positive_restricted(self, o):
+        """Visit the operand of `o` with self.restriction set to Restriction.POSITIVE.
+        The restriction is reset to Restriction.NONE afterwards; the assertion rejects
+        a restriction that is already set when this handler is entered.
+        """
         assert self.restriction == Restriction.NONE
         self.restriction = Restriction.POSITIVE
-        from dune.perftool.generation import global_context
-        with global_context(restriction=Restriction.POSITIVE):
-            ret = self.call(o.ufl_operands[0])
+        ret = self.call(o.ufl_operands[0])
         self.restriction = Restriction.NONE
         return ret
     def negative_restricted(self, o):
+        """Visit the operand of `o` with self.restriction set to Restriction.NEGATIVE.
+        The restriction is reset to Restriction.NONE afterwards; the assertion rejects
+        a restriction that is already set when this handler is entered.
+        """
         assert self.restriction == Restriction.NONE
         self.restriction = Restriction.NEGATIVE
-        from dune.perftool.generation import global_context
-        with global_context(restriction=Restriction.NEGATIVE):
-            ret = self.call(o.ufl_operands[0])
+        ret = self.call(o.ufl_operands[0])
         self.restriction = Restriction.NONE
         return ret