diff --git a/dune/perftool/common/muladd_workarounds.hh b/dune/perftool/common/muladd_workarounds.hh
new file mode 100644
index 0000000000000000000000000000000000000000..9ca81ed0402c447f7ae39a931c477057ddd4e00a
--- /dev/null
+++ b/dune/perftool/common/muladd_workarounds.hh
@@ -0,0 +1,26 @@
+#ifndef DUNE_PERFTOOL_MULADD_WORKAROUNDS_HH
+#define DUNE_PERFTOOL_MULADD_WORKAROUNDS_HH
+
+/* We are currently having some issues with FMA nodes not being
+ * eliminated correctly upon code generation. We "solve" the problem
+ * for now with overloads of the mul_add function for scalars.
+ */
+
+#include<dune/perftool/common/opcounter.hh>
+
+
+inline double mul_add(double op1, double& op2, double op3)
+{
+	return op1 * op2 + op3;
+}
+
+#ifdef ENABLE_COUNTER
+
+op::OpCounter<double> mul_add(op::OpCounter<double> op1, op::OpCounter<double>& op2, op::OpCounter<double> op3)
+{
+	return op1 * op2 + op3;
+}
+
+#endif
+
+#endif
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index bd73b61e9fe0bfb523fc14fe4c942db79e88dac1..83d42e9d4bd25fadce5801cc7eb5e3c9f12f8601 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -84,6 +84,7 @@ class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper):
 
     def map_fused_multiply_add(self, expr, type_context):
         if self.codegen_state.vectorization_info:
+            include_file("dune/perftool/common/muladd_workarounds.hh", filetag="operatorfile")
             # If this is vectorized we call the VCL function mul_add
             return prim.Call(prim.Variable("mul_add"),
                              (self.rec(expr.mul_op1, type_context),
diff --git a/python/dune/perftool/pdelab/tensors.py b/python/dune/perftool/pdelab/tensors.py
index 496c5250ed2e5b6a72d22a00d1446727b2610b1e..a8e15d846be6cbb958508652b105cb8a1c64a83e 100644
--- a/python/dune/perftool/pdelab/tensors.py
+++ b/python/dune/perftool/pdelab/tensors.py
@@ -19,12 +19,14 @@ def define_list_tensor(name, expr, visitor, stack=()):
         if isinstance(child, ListTensor):
             define_list_tensor(name, child, visitor, stack=stack + (i,))
         else:
-            # TODO: the dependency below is not very nice!
+            visexpr = visitor.call(child)
+            from loopy.symbolic import DependencyMapper
+            deps = DependencyMapper(include_subscripts=False, include_lookups=False, include_calls=False)(visexpr)
             instruction(assignee=prim.Subscript(prim.Variable(name), stack + (i,)),
                         expression=visitor.call(child),
                         forced_iname_deps=frozenset(visitor.interface.quadrature_inames()),
+                        depends_on=frozenset({lp.match.Or(tuple(lp.match.Writes(v.name) for v in deps))}).union(frozenset({lp.match.Tagged("sumfact_stage1")})),
                         tags=frozenset({"quad"}),
-                        depends_on=frozenset({"sumfact_stage1"}),
                         )
 
 
diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py
index 6a07678ac59cf1d3252cd5388d3ce7519238e72d..3b519622af83ca6d535dacf5c553c61e1455a3f0 100644
--- a/python/dune/perftool/sumfact/__init__.py
+++ b/python/dune/perftool/sumfact/__init__.py
@@ -1,3 +1,5 @@
+from dune.perftool.generation import get_backend
+from dune.perftool.options import option_switch
 from dune.perftool.pdelab.argument import (name_applycontainer,
                                            name_coefficientcontainer,
                                            )
@@ -60,7 +62,7 @@ class SumFactInterface(PDELabInterface):
 #        return to_global(pymbolic_quadrature_position())
 
     def pymbolic_spatial_coordinate(self, visitor=None):
-        from dune.perftool.sumfact.geometry import pymbolic_spatial_coordinate
-        ret, indices = pymbolic_spatial_coordinate(visitor.indices)
+        import dune.perftool.sumfact.geometry
+        ret, indices = get_backend(interface="spatial_coordinate", selector=option_switch("diagonal_transformation_matrix"))(visitor.indices)
         visitor.indices = indices
         return ret
diff --git a/python/dune/perftool/sumfact/geometry.py b/python/dune/perftool/sumfact/geometry.py
index 02e7a75b2c687322ba6c556e31985de6c1e762e1..9a8d4f1f46f0f44c88dace42b93fe1927bc0b698 100644
--- a/python/dune/perftool/sumfact/geometry.py
+++ b/python/dune/perftool/sumfact/geometry.py
@@ -1,23 +1,29 @@
 """ Sum factorized geometry evaluations """
 
-from dune.perftool.generation import (domain,
+from dune.perftool.generation import (backend,
+                                      domain,
                                       get_backend,
                                       get_counted_variable,
                                       iname,
                                       instruction,
                                       kernel_cached,
+                                      preamble,
                                       temporary_variable,
+                                      globalarg,
                                       )
 from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
+                                           name_geometry,
                                            )
 from dune.perftool.sumfact.symbolic import SumfactKernelInputBase
 from dune.perftool.sumfact.vectorization import attach_vectorization_info
+from dune.perftool.options import option_switch
 
 from pytools import ImmutableRecord
 
 import pymbolic.primitives as prim
+import numpy as np
 
 
 @iname
@@ -38,8 +44,6 @@ class GeoCornersInput(SumfactKernelInputBase, ImmutableRecord):
                                     )
 
         ciname = corner_iname()
-
-        from dune.perftool.pdelab.geometry import name_geometry
         geo = name_geometry()
 
         # NB: We need to realize this as a C instruction, because the corner
@@ -61,9 +65,9 @@ class GeoCornersInput(SumfactKernelInputBase, ImmutableRecord):
                     tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
                     )
 
-
 @kernel_cached
-def pymbolic_spatial_coordinate(visitor_indices):
+@backend(interface="spatial_coordinate", name="default")
+def pymbolic_spatial_coordinate_multilinear(visitor_indices):
     assert len(visitor_indices) == 1
 
     # Construct the matrix sequence for the evaluation of the global coordinate.
@@ -89,3 +93,62 @@ def pymbolic_spatial_coordinate(visitor_indices):
     var, _ = realize_sum_factorization_kernel(vsf)
 
     return prim.Subscript(var, vsf.quadrature_index(sf)), None
+
+
+@preamble
+def define_corner(name, low):
+    geo = name_geometry()
+    return "auto {} = {}.corner({});".format(name,
+                                             geo,
+                                             0 if low else 2 ** local_dimension() - 1)
+
+
+@preamble
+def define_mesh_width(name):
+    define_corner(name, False)
+    lower = name_lowerleft_corner()
+    return "{} -= {};".format(name, lower)
+
+
+def name_lowerleft_corner():
+    name = "lowerleft_corner"
+    globalarg(name, dtype=np.float64, shape=(world_dimension(),))
+    define_corner(name, True)
+    return name
+
+
+def name_meshwidth():
+    name = "meshwidth"
+    globalarg(name, dtype=np.float64, shape=(world_dimension(),))
+    define_mesh_width(name)
+    return name
+
+
+@kernel_cached
+@backend(interface="spatial_coordinate", name="diagonal_transformation_matrix")
+def pymbolic_spatial_coordinate_axiparallel(visitor_indices):
+    assert len(visitor_indices) == 1
+    index, = visitor_indices
+
+    # Urgh: *SOMEHOW* construct a face direction
+    from dune.perftool.pdelab.restriction import Restriction
+    restriction = Restriction.NONE
+    from dune.perftool.generation import get_global_context_value
+    if get_global_context_value("integral_type") == "interior_facet":
+        restriction = Restriction.NEGATIVE
+    from dune.perftool.sumfact.switch import get_facedir
+    face = get_facedir(restriction)
+
+    lowcorner = name_lowerleft_corner()
+    meshwidth = name_meshwidth()
+
+    if index == face:
+        x = 0
+    else:
+        iindex = index
+        if face is not None and index > face:
+            iindex = iindex - 1
+        from dune.perftool.sumfact.quadrature import pymbolic_quadrature_position
+        x = prim.Subscript(pymbolic_quadrature_position(), (iindex,))
+
+    return prim.Subscript(prim.Variable(lowcorner), (index,)) + x * prim.Subscript(prim.Variable(meshwidth), (index,)), None