diff --git a/applications/knl/poisson_dg/poisson_dg.ufl b/applications/knl/poisson_dg/poisson_dg.ufl
index 2c0f2c34436f1c6b76817d6abad344fae3183d62..7b5c3e548f81688a0735256411fd54e9d49fa9f4 100644
--- a/applications/knl/poisson_dg/poisson_dg.ufl
+++ b/applications/knl/poisson_dg/poisson_dg.ufl
@@ -12,21 +12,21 @@ n = FacetNormal(cell)('+')
 
 alpha = 1.0
 h_ext = CellVolume(cell) / FacetArea(cell)
-cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
 h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
-cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 theta = -1.0
 
 r = inner(grad(u), grad(v))*dx \
   + inner(n, avg(grad(u)))*jump(v)*dS \
-  + cse_gamma_int*jump(u)*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
   - theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + cse_gamma_ext*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
   - f*v*dx \
   - theta*g*inner(grad(v), n)*ds \
-  - cse_gamma_ext*g*v*ds
+  - gamma_ext*g*v*ds
 
 forms = [r]
diff --git a/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
index 00267f15b9331b4705138cfb1a4698ee357a5767..d3c95a5226e26981cf78e5d8b78a686150a9310d 100644
--- a/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
+++ b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
@@ -16,20 +16,20 @@ n = FacetNormal(cell)('+')
 
 alpha = 3.0
 h_ext = CellVolume(cell) / FacetArea(cell)
-cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
 h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
-cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 theta = -1.0
 
 r = (inner(A*grad(u), grad(v)) + (c*u-f)*v)*dx \
   + inner(n, A*avg(grad(u)))*jump(v)*dS \
-  + cse_gamma_int*jump(u)*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
   - theta*jump(u)*inner(A*avg(grad(v)), n)*dS \
   - inner(n, A*grad(u))*v*ds \
-  + cse_gamma_ext*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(A*grad(v), n)*ds \
   - theta*g*inner(A*grad(v), n)*ds \
-  - cse_gamma_ext*g*v*ds
+  - gamma_ext*g*v*ds
 
 forms = [r]
diff --git a/applications/poisson_dg/poisson_dg.ufl b/applications/poisson_dg/poisson_dg.ufl
index 2c0f2c34436f1c6b76817d6abad344fae3183d62..7b5c3e548f81688a0735256411fd54e9d49fa9f4 100644
--- a/applications/poisson_dg/poisson_dg.ufl
+++ b/applications/poisson_dg/poisson_dg.ufl
@@ -12,21 +12,21 @@ n = FacetNormal(cell)('+')
 
 alpha = 1.0
 h_ext = CellVolume(cell) / FacetArea(cell)
-cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
 h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
-cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 theta = -1.0
 
 r = inner(grad(u), grad(v))*dx \
   + inner(n, avg(grad(u)))*jump(v)*dS \
-  + cse_gamma_int*jump(u)*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
   - theta*jump(u)*inner(avg(grad(v)), n)*dS \
   - inner(n, grad(u))*v*ds \
-  + cse_gamma_ext*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(grad(v), n)*ds \
   - f*v*dx \
   - theta*g*inner(grad(v), n)*ds \
-  - cse_gamma_ext*g*v*ds
+  - gamma_ext*g*v*ds
 
 forms = [r]
diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.ufl b/applications/poisson_dg_tensor/poisson_dg_tensor.ufl
index 00267f15b9331b4705138cfb1a4698ee357a5767..d3c95a5226e26981cf78e5d8b78a686150a9310d 100644
--- a/applications/poisson_dg_tensor/poisson_dg_tensor.ufl
+++ b/applications/poisson_dg_tensor/poisson_dg_tensor.ufl
@@ -16,20 +16,20 @@ n = FacetNormal(cell)('+')
 
 alpha = 3.0
 h_ext = CellVolume(cell) / FacetArea(cell)
-cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
 h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
-cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
 
 theta = -1.0
 
 r = (inner(A*grad(u), grad(v)) + (c*u-f)*v)*dx \
   + inner(n, A*avg(grad(u)))*jump(v)*dS \
-  + cse_gamma_int*jump(u)*jump(v)*dS \
+  + gamma_int*jump(u)*jump(v)*dS \
   - theta*jump(u)*inner(A*avg(grad(v)), n)*dS \
   - inner(n, A*grad(u))*v*ds \
-  + cse_gamma_ext*u*v*ds \
+  + gamma_ext*u*v*ds \
   + theta*u*inner(A*grad(v), n)*ds \
   - theta*g*inner(A*grad(v), n)*ds \
-  - cse_gamma_ext*g*v*ds
+  - gamma_ext*g*v*ds
 
 forms = [r]
diff --git a/python/dune/perftool/compile.py b/python/dune/perftool/compile.py
index d2639ac6934b10a936711744c515890eba2d0a02..8d1dcc503ff8971db3d11858e47d372f834c7948 100644
--- a/python/dune/perftool/compile.py
+++ b/python/dune/perftool/compile.py
@@ -84,9 +84,6 @@ def read_ufl(uflfile):
     # Enrich data by some additional objects we deem worth keeping
     if get_option("exact_solution_expression"):
         data.object_by_name[get_option("exact_solution_expression")] = namespace[get_option("exact_solution_expression")]
-    for name, expr in namespace.items():
-        if name.startswith("cse"):
-            data.object_by_name[name] = namespace[name]
 
     formdatas = []
     forms = data.forms
diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index d8ab37fbb57f176e4776eb0436de1ef88363254d..c819772e874a6264a05276873d7ff553fb41188b 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -32,7 +32,6 @@ from dune.perftool.generation.cpp import (base_class,
 
 from dune.perftool.generation.loopy import (barrier,
                                             constantarg,
-                                            construct_subst_rule,
                                             domain,
                                             function_mangler,
                                             get_temporary_name,
@@ -43,8 +42,6 @@ from dune.perftool.generation.loopy import (barrier,
                                             kernel_cached,
                                             noop_instruction,
                                             silenced_warning,
-                                            set_subst_rule,
-                                            subst_rule,
                                             temporary_variable,
                                             transform,
                                             valuearg,
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index d098c798fe13f1bdd34fd4502fc3b8a0b4f6fa1c..8cc073fc68c89572dd0dd5b9e5947cc5690f5439 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -198,31 +198,3 @@ def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwar
     globalarg(name, **kwargs)
 
     return name
-
-
-@generator_factory(item_tags=("substrule_name",),
-                   context_tags="kernel",
-                   cache_key_generator=lambda e, n: e)
-def _substrule_name(expr, name):
-    return name
-
-
-@generator_factory(item_tags=("substrule_impl",),
-                   context_tags="kernel",
-                   cache_key_generator=lambda n, e, **ex: e,
-                   )
-def subst_rule(name, expr, exists=False):
-    _substrule_name(expr, name)
-    return exists
-
-
-def set_subst_rule(name, expr):
-    subst_rule(name, expr, exists=True)
-
-
-@generator_factory(item_tags=("substrule",),
-                   context_tags="kernel")
-def construct_subst_rule(expr, visitor):
-    name = _substrule_name(expr, None)
-    assert name
-    return lp.SubstitutionRule(name, (), visitor(expr))
diff --git a/python/dune/perftool/loopy/transformations/collect_rotate.py b/python/dune/perftool/loopy/transformations/collect_rotate.py
index b64503b4f52eb853edd770de513cda8faa28c6ff..167de058f85b773978d4d6cef101101df72bf840 100644
--- a/python/dune/perftool/loopy/transformations/collect_rotate.py
+++ b/python/dune/perftool/loopy/transformations/collect_rotate.py
@@ -3,6 +3,7 @@ is filled and then does vector computations """
 
 from dune.perftool.generation import (function_mangler,
                                       include_file,
+                                      loopy_class_member,
                                       )
 from dune.perftool.loopy.vcl import get_vcl_type, get_vcl_type_size
 from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_vector_view,
@@ -11,7 +12,7 @@ from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_v
                                                             )
 from dune.perftool.loopy.symbolic import substitute
 from dune.perftool.sumfact.quadrature import quadrature_inames
-from dune.perftool.tools import get_pymbolic_basename, get_pymbolic_tag
+from dune.perftool.tools import get_pymbolic_basename, get_pymbolic_tag, ceildiv
 from dune.perftool.options import get_option
 
 from loopy.kernel.creation import parse_domains
@@ -19,6 +20,7 @@ from loopy.symbolic import pw_aff_to_expr
 from loopy.match import Tagged
 
 from loopy.symbolic import DependencyMapper
+from pytools import product
 
 import pymbolic.primitives as prim
 import loopy as lp
@@ -234,6 +236,23 @@ def collect_vector_data_rotate(knl):
                 replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)),
                                                       (vector_indices.get(1), prim.Variable(new_iname)),
                                                       )
+        elif quantity in [a.name for a in knl.args]:
+            arg, = [a for a in knl.args if a.name == quantity]
+            tags = set(get_pymbolic_tag(expr) for expr in quantity_exprs)
+            if tags and tags.pop() == "operator_precomputed":
+                expr, = quantity_exprs
+                shape=(ceildiv(product(s for s in arg.shape), vec_size), vec_size)
+                name = loopy_class_member(quantity,
+                                          shape=shape,
+                                          dim_tags="f,vec",
+                                          potentially_vectorized=True,
+                                          classtag="operator",
+                                          dtype=np.float64,
+                                          )
+                knl = knl.copy(args=knl.args + [lp.GlobalArg(name, shape=shape, dim_tags="c,vec", dtype=np.float64)])
+                replacemap_vec[expr] = prim.Subscript(prim.Variable(name),
+                                                      (vector_indices.get(1), prim.Variable(new_iname)),
+                                                      )
 
     new_insns = [i.copy(expression=substitute(i.expression, replacemap_arr)) for i in new_insns]
 
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 7d7712d91e394998aeb0377a11bd62eb7cdf9c62..8dff610bd5750df4176f684aeb3542ad476ffcab 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -65,6 +65,7 @@ class PerftoolOptionsArray(ImmutableRecord):
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = PerftoolOption(default=256, helpstr=None)
     unroll_dimension_loops = PerftoolOption(default=False, helpstr="whether loops over the gemetric dimension should be unrolled.")
+    precompute_quadrature_info = PerftoolOption(default=True, helpstr="whether loops over the gemetric dimension should be unrolled.")
 
 
 # Until more sophisticated logic is needed, we keep the actual option data in this module
diff --git a/python/dune/perftool/pdelab/__init__.py b/python/dune/perftool/pdelab/__init__.py
index ac9ad0e46f5203a23b52ef52090f28cd2a811b72..d05bcad57ddde467de202a09f337c873317c5452 100644
--- a/python/dune/perftool/pdelab/__init__.py
+++ b/python/dune/perftool/pdelab/__init__.py
@@ -35,6 +35,10 @@ from dune.perftool.pdelab.tensors import pymbolic_list_tensor, pymbolic_identity
 
 
 class PDELabInterface(object):
+    def __init__(self):
+        # The visitor instance will be registered by its init method
+        self.visitor = None
+
     #
     # TODO: The following ones are actually entirely PDELab independent!
     # They should be placed elsewhere and be used directly in the visitor.
@@ -63,16 +67,16 @@ class PDELabInterface(object):
     def pymbolic_reference_gradient(self, element, restriction, number):
         return pymbolic_reference_gradient(element, restriction, number)
 
-    def pymbolic_trialfunction_gradient(self, element, restriction, component, visitor=None):
+    def pymbolic_trialfunction_gradient(self, element, restriction, component):
         return pymbolic_trialfunction_gradient(element, restriction, component)
 
-    def pymbolic_apply_function_gradient(self, element, restriction, component, visitor=None):
+    def pymbolic_apply_function_gradient(self, element, restriction, component):
         return pymbolic_apply_function_gradient(element, restriction, component)
 
-    def pymbolic_trialfunction(self, element, restriction, component, visitor=None):
+    def pymbolic_trialfunction(self, element, restriction, component):
         return pymbolic_trialfunction(element, restriction, component)
 
-    def pymbolic_apply_function(self, element, restriction, component, visitor=None):
+    def pymbolic_apply_function(self, element, restriction, component):
         return pymbolic_apply_function(element, restriction, component)
 
     #
@@ -89,8 +93,8 @@ class PDELabInterface(object):
     # Tensor expression related generator functions
     #
 
-    def pymbolic_list_tensor(self, o, visitor):
-        return pymbolic_list_tensor(o, visitor)
+    def pymbolic_list_tensor(self, o):
+        return pymbolic_list_tensor(o, self.visitor)
 
     def pymbolic_identity(self, o):
         return pymbolic_identity(o)
@@ -99,7 +103,7 @@ class PDELabInterface(object):
     # Geometry related generator functions
     #
 
-    def pymbolic_spatial_coordinate(self, visitor=None):
+    def pymbolic_spatial_coordinate(self):
         return to_global(pymbolic_quadrature_position())
 
     def name_facet_jacobian_determinant(self):
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 3bae1ea9554c7c5f5a751be17128bf215c8b0b55..4e7227e12b1925ee59588023685471460046c3e6 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -19,7 +19,6 @@ from dune.perftool.generation import (backend,
                                       post_include,
                                       retrieve_cache_functions,
                                       retrieve_cache_items,
-                                      set_subst_rule,
                                       template_parameter,
                                       )
 from dune.perftool.cgen.clazz import (AccessModifier,
@@ -408,25 +407,6 @@ def visit_integrals(integrals):
             from dune.perftool.ufl.visitor import UFL2LoopyVisitor
             visitor = UFL2LoopyVisitor(interface, measure, indexmap)
 
-            # Inspect the UFL file for manual common subexpression elimination stuff!
-            data = get_global_context_value("data")
-            for name, expr in data.object_by_name.items():
-                if name.startswith("cse"):
-                    set_subst_rule(name, expr)
-
-            # Ensure CSE on detjac * quadrature weight
-            domain = accterm.term.ufl_domain()
-            if measure == "cell":
-                set_subst_rule("integration_factor_cell1",
-                               uc.QuadratureWeight(domain) * uc.Abs(uc.JacobianDeterminant(domain)))
-                set_subst_rule("integration_factor_cell2",
-                               uc.Abs(uc.JacobianDeterminant(domain)) * uc.QuadratureWeight(domain))
-            else:
-                set_subst_rule("integration_factor_facet1",
-                               uc.FacetJacobianDeterminant(domain) * uc.QuadratureWeight(domain))
-                set_subst_rule("integration_factor_facet2",
-                               uc.QuadratureWeight(domain) * uc.FacetJacobianDeterminant(domain))
-
             get_backend(interface="accum_insn")(visitor, accterm, measure, subdomain_id)
 
 
diff --git a/python/dune/perftool/pdelab/tensors.py b/python/dune/perftool/pdelab/tensors.py
index a8e15d846be6cbb958508652b105cb8a1c64a83e..2996ac5e24ecb5abf59980cf5ded3af223a4d4bd 100644
--- a/python/dune/perftool/pdelab/tensors.py
+++ b/python/dune/perftool/pdelab/tensors.py
@@ -25,7 +25,7 @@ def define_list_tensor(name, expr, visitor, stack=()):
             instruction(assignee=prim.Subscript(prim.Variable(name), stack + (i,)),
                         expression=visitor.call(child),
                         forced_iname_deps=frozenset(visitor.interface.quadrature_inames()),
-                        depends_on=frozenset({lp.match.Or(tuple(lp.match.Writes(v.name) for v in deps))}).union(frozenset({lp.match.Tagged("sumfact_stage1")})),
+                        depends_on=frozenset({lp.match.Tagged("sumfact_stage1")}),
                         tags=frozenset({"quad"}),
                         )
 
diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py
index f860a1fcade1275b12f80ecd8d85bbc70799efca..36d18d54abc1d6bc3c5fa9b19f7cce2142662c29 100644
--- a/python/dune/perftool/sumfact/__init__.py
+++ b/python/dune/perftool/sumfact/__init__.py
@@ -31,24 +31,24 @@ class SumFactInterface(PDELabInterface):
     def pymbolic_reference_gradient(self, element, restriction, number):
         return pymbolic_reference_gradient(element, restriction, number)
 
-    def pymbolic_trialfunction_gradient(self, element, restriction, component, visitor=None):
-        ret, indices = pymbolic_coefficient_gradient(element, restriction, component, name_coefficientcontainer, visitor.indices)
-        visitor.indices = indices
+    def pymbolic_trialfunction_gradient(self, element, restriction, component):
+        ret, indices = pymbolic_coefficient_gradient(element, restriction, component, name_coefficientcontainer, self.visitor.indices)
+        self.visitor.indices = indices
         return ret
 
-    def pymbolic_trialfunction(self, element, restriction, component, visitor=None):
-        ret, indices = pymbolic_coefficient(element, restriction, component, name_coefficientcontainer, visitor.indices)
-        visitor.indices = indices
+    def pymbolic_trialfunction(self, element, restriction, component):
+        ret, indices = pymbolic_coefficient(element, restriction, component, name_coefficientcontainer, self.visitor.indices)
+        self.visitor.indices = indices
         return ret
 
-    def pymbolic_apply_function_gradient(self, element, restriction, component, visitor=None):
-        ret, indices = pymbolic_coefficient_gradient(element, restriction, component, name_applycontainer, visitor.indices)
-        visitor.indices = indices
+    def pymbolic_apply_function_gradient(self, element, restriction, component):
+        ret, indices = pymbolic_coefficient_gradient(element, restriction, component, name_applycontainer, self.visitor.indices)
+        self.visitor.indices = indices
         return ret
 
-    def pymbolic_apply_function(self, element, restriction, component, visitor=None):
-        ret, indices = pymbolic_coefficient(element, restriction, component, name_applycontainer, visitor.indices)
-        visitor.indices = indices
+    def pymbolic_apply_function(self, element, restriction, component):
+        ret, indices = pymbolic_coefficient(element, restriction, component, name_applycontainer, self.visitor.indices)
+        self.visitor.indices = indices
         return ret
 
     def quadrature_inames(self):
@@ -57,8 +57,8 @@ class SumFactInterface(PDELabInterface):
     def pymbolic_quadrature_weight(self):
         return quadrature_weight()
 
-    def pymbolic_spatial_coordinate(self, visitor=None):
+    def pymbolic_spatial_coordinate(self):
         import dune.perftool.sumfact.geometry
-        ret, indices = get_backend(interface="spatial_coordinate", selector=option_switch("diagonal_transformation_matrix"))(visitor.indices)
-        visitor.indices = indices
+        ret, indices = get_backend(interface="spatial_coordinate", selector=option_switch("diagonal_transformation_matrix"))(self.visitor.indices)
+        self.visitor.indices = indices
         return ret
diff --git a/python/dune/perftool/sumfact/geometry.py b/python/dune/perftool/sumfact/geometry.py
index 9a8d4f1f46f0f44c88dace42b93fe1927bc0b698..fce0ec9b8b403ed2b5439f70b418421823ff1804 100644
--- a/python/dune/perftool/sumfact/geometry.py
+++ b/python/dune/perftool/sumfact/geometry.py
@@ -149,6 +149,6 @@ def pymbolic_spatial_coordinate_axiparallel(visitor_indices):
         if face is not None and index > face:
             iindex = iindex - 1
         from dune.perftool.sumfact.quadrature import pymbolic_quadrature_position
-        x = prim.Subscript(pymbolic_quadrature_position(), (iindex,))
+        x = pymbolic_quadrature_position(iindex)
 
     return prim.Subscript(prim.Variable(lowcorner), (index,)) + x * prim.Subscript(prim.Variable(meshwidth), (index,)), None
diff --git a/python/dune/perftool/sumfact/quadrature.py b/python/dune/perftool/sumfact/quadrature.py
index 5e6935c3ba368e94300d2e22e6885f98e544af29..d12b7aeb84da5c107235c7cf5872245a0e0500e1 100644
--- a/python/dune/perftool/sumfact/quadrature.py
+++ b/python/dune/perftool/sumfact/quadrature.py
@@ -4,6 +4,7 @@ from dune.perftool.generation import (backend,
                                       get_global_context_value,
                                       iname,
                                       instruction,
+                                      loopy_class_member,
                                       temporary_variable,
                                       )
 
@@ -15,6 +16,7 @@ from dune.perftool.pdelab.argument import name_accumulation_variable
 from dune.perftool.pdelab.geometry import (dimension_iname,
                                            local_dimension,
                                            )
+from dune.perftool.options import get_option
 
 from loopy import CallMangleInfo
 from loopy.symbolic import FunctionIdentifier
@@ -26,7 +28,9 @@ from pymbolic.primitives import (Call,
                                  Variable,
                                  )
 
-import numpy
+import pymbolic.primitives as prim
+import loopy as lp
+import numpy as np
 
 
 def nest_quadrature_loops(kernel, inames):
@@ -58,7 +62,7 @@ class BaseWeight(FunctionIdentifier):
 @function_mangler
 def base_weight_function_mangler(target, func, dtypes):
     if isinstance(func, BaseWeight):
-        return CallMangleInfo(func.name, (NumpyType(numpy.float64),), ())
+        return CallMangleInfo(func.name, (NumpyType(np.float64),), ())
 
 
 def pymbolic_base_weight():
@@ -82,6 +86,18 @@ def quadrature_inames():
     return tuple(sumfact_quad_iname(d, quadrature_points_per_direction()) for d in range(local_dimension()))
 
 
+@iname(kernel="operator")
+def constructor_quad_iname(name, d, bound):
+    name = "{}_{}".format(name, d)
+    domain(name, quadrature_points_per_direction(), kernel="operator")
+    return name
+
+
+def constructor_quadrature_inames(dim, num1d):
+    name = "quadiname_dim{}_num{}".format(dim, num1d)
+    return tuple(constructor_quad_iname(name, d, quadrature_points_per_direction()) for d in range(local_dimension()))
+
+
 def define_recursive_quadrature_weight(name, dir):
     iname = quadrature_inames()[dir]
     temporary_variable(name, shape=(), shape_impl=())
@@ -107,25 +123,76 @@ def recursive_quadrature_weight(dir=0):
 
 
 def quadrature_weight():
-    return recursive_quadrature_weight()
+    # Return non-precomputed version
+    if not get_option("precompute_quadrature_info"):
+        return recursive_quadrature_weight()
+
+    dim = local_dimension()
+    num1d = quadrature_points_per_direction()
+    name = "quad_weights_dim{}_num{}".format(dim, num1d)
+
+    # Add a class member
+    loopy_class_member(name,
+                       dtype=np.float64,
+                       shape=(num1d,) * dim,
+                       classtag="operator",
+                       dim_tags=",".join(["f"] * dim),
+                       managed=True,
+                       potentially_vectorized=True,
+                       )
+
+    # Precompute it in the constructor
+    inames = constructor_quadrature_inames(dim, num1d)
+    instruction(assignee=prim.Subscript(prim.Variable(name), tuple(prim.Variable(i) for i in inames)),
+                expression=prim.Product(tuple(Subscript(Variable(name_oned_quadrature_weights()), (prim.Variable(i),)) for i in inames)),
+                within_inames=frozenset(inames),
+                kernel="operator",
+                )
 
+    return prim.Subscript(lp.symbolic.TaggedVariable(name, "operator_precomputed"), tuple(prim.Variable(i) for i in quadrature_inames()))
 
-def define_quadrature_position(name):
-    for i in range(local_dimension()):
-        instruction(expression=Subscript(Variable(name_oned_quadrature_points()), (Variable(quadrature_inames()[i]),)),
-                    assignee=Subscript(Variable(name), (i,)),
-                    forced_iname_deps=frozenset(quadrature_inames()),
-                    forced_iname_deps_is_final=True,
-                    tags=frozenset({"quad"}),
-                    )
+
+def define_quadrature_position(name, index):
+    instruction(expression=Subscript(Variable(name_oned_quadrature_points()), (Variable(quadrature_inames()[index]),)),
+                assignee=Subscript(Variable(name), (index,)),
+                forced_iname_deps=frozenset(quadrature_inames()),
+                forced_iname_deps_is_final=True,
+                tags=frozenset({"quad"}),
+                )
 
 
 @backend(interface="quad_pos", name="sumfact")
-def pymbolic_quadrature_position():
-    name = 'pos'
-    temporary_variable(name, shape=(local_dimension(),), shape_impl=("fv",))
-    define_quadrature_position(name)
-    return Variable(name)
+def pymbolic_quadrature_position(index):
+    # Return the non-precomputed version
+    if not get_option("precompute_quadrature_info"):
+        name = 'pos'
+        temporary_variable(name, shape=(local_dimension(),), shape_impl=("fv",))
+        define_quadrature_position(name, index)
+        return prim.Subscript(prim.Variable(name), (index,))
+
+    assert isinstance(index, int)
+    dim = local_dimension()
+    num1d = quadrature_points_per_direction()
+    name = "quad_points_dim{}_num{}_dir{}".format(dim, num1d, index)
+
+    loopy_class_member(name,
+                       dtype=np.float64,
+                       shape=(num1d,) * dim,
+                       classtag="operator",
+                       dim_tags=",".join(["f"] * dim),
+                       managed=True,
+                       potentially_vectorized=True,
+                       )
+
+    # Precompute it in the constructor
+    inames = constructor_quadrature_inames(dim, num1d)
+    instruction(assignee=prim.Subscript(prim.Variable(name), tuple(prim.Variable(i) for i in inames)),
+                expression=Subscript(Variable(name_oned_quadrature_points()), (prim.Variable(inames[index]))),
+                within_inames=frozenset(inames),
+                kernel="operator",
+                )
+
+    return prim.Subscript(lp.symbolic.TaggedVariable(name, "operator_precomputed"), tuple(prim.Variable(i) for i in quadrature_inames()))
 
 
 @backend(interface="qp_in_cell", name="sumfact")
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index 9ccd5891c60d6840ceb2e9e3baf4981a54a62ed9..c0405c6a244faa71fe52e4fbe7ce326cab220722 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -25,7 +25,7 @@ from dune.perftool.pdelab.localoperator import (name_domain_field,
                                                 lop_template_range_field,
                                                 )
 from dune.perftool.pdelab.quadrature import quadrature_order
-from dune.perftool.tools import maybe_wrap_subscript
+from dune.perftool.tools import maybe_wrap_subscript, ceildiv
 from loopy import CallMangleInfo
 from loopy.symbolic import FunctionIdentifier
 from loopy.types import NumpyType
@@ -37,10 +37,6 @@ import loopy as lp
 import numpy as np
 
 
-def ceildiv(a, b):
-    return -(-a // b)
-
-
 class BasisTabulationMatrixBase(object):
     pass
 
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index e6259e44b0cd4d030587741f8361e804b8fd3e55..a0dfaf29af06abcdc3f3e49d71291e663536e130 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -59,3 +59,7 @@ def get_pymbolic_tag(expr):
         return get_pymbolic_tag(expr.aggregate)
 
     raise NotImplementedError("Cannot determine tag on {}".format(expr))
+
+
+def ceildiv(a, b):
+    return -(-a // b)
diff --git a/python/dune/perftool/ufl/visitor.py b/python/dune/perftool/ufl/visitor.py
index 6c7ddb1afa67c9abc3ed1e007a91e4ad4dbd94ee..9ec9afd1bc65990126f9c225335d975a305733c5 100644
--- a/python/dune/perftool/ufl/visitor.py
+++ b/python/dune/perftool/ufl/visitor.py
@@ -3,11 +3,7 @@ This module defines the main visitor algorithm transforming ufl expressions
 to pymbolic and loopy.
 """
 from dune.perftool.error import PerftoolUFLError
-from dune.perftool.generation import (construct_subst_rule,
-                                      domain,
-                                      get_global_context_value,
-                                      subst_rule,
-                                      )
+from dune.perftool.generation import get_global_context_value, domain
 from dune.perftool.ufl.flatoperators import get_operands
 from dune.perftool.ufl.modified_terminals import (ModifiedTerminalTracker,
                                                   Restriction,
@@ -39,11 +35,11 @@ import pymbolic.primitives as prim
 
 
 class UFL2LoopyVisitor(ModifiedTerminalTracker):
-    def __init__(self, interface, measure, dimension_indices, donot_check_substrules=None):
+    def __init__(self, interface, measure, dimension_indices):
         self.interface = interface
+        self.interface.visitor = self
         self.measure = measure
         self.dimension_indices = dimension_indices
-        self.donot_check_substrules = donot_check_substrules
 
         # Call base class constructors
         super(UFL2LoopyVisitor, self).__init__()
@@ -56,18 +52,7 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
 
         return self.call(o)
 
-    def call(self, o):
-        if o != self.donot_check_substrules and subst_rule(None, o):
-            rule = construct_subst_rule(o,
-                                        type(self)(self.interface,
-                                                   self.measure,
-                                                   self.dimension_indices,
-                                                   donot_check_substrules=o,
-                                                   )
-                                        )
-            return prim.Call(prim.Variable(rule.name), ())
-        else:
-            return MultiFunction.__call__(self, o)
+    call = MultiFunction.__call__
 
     #
     # Form argument/coefficients handlers:
@@ -131,14 +116,14 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
 
             if self.reference_grad:
                 if o.count() == 0:
-                    return self.interface.pymbolic_trialfunction_gradient(o.ufl_element(), restriction, self.component, visitor=self)
+                    return self.interface.pymbolic_trialfunction_gradient(o.ufl_element(), restriction, self.component)
                 else:
-                    return self.interface.pymbolic_apply_function_gradient(o.ufl_element(), restriction, self.component, visitor=self)
+                    return self.interface.pymbolic_apply_function_gradient(o.ufl_element(), restriction, self.component)
             else:
                 if o.count() == 0:
-                    return self.interface.pymbolic_trialfunction(o.ufl_element(), restriction, self.component, visitor=self)
+                    return self.interface.pymbolic_trialfunction(o.ufl_element(), restriction, self.component)
                 else:
-                    return self.interface.pymbolic_apply_function(o.ufl_element(), restriction, self.component, visitor=self)
+                    return self.interface.pymbolic_apply_function(o.ufl_element(), restriction, self.component)
 
         # Check if this is a parameter function
         else:
@@ -231,7 +216,12 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         return self._index_or_fixed_index(o)
 
     def list_tensor(self, o):
-        return self.interface.pymbolic_list_tensor(o, self)
+        if all(isinstance(i, int) for i in self.indices):
+            index = self.indices[0]
+            self.indices = self.indices[1:]
+            return self.call(o.ufl_operands[index])
+        else:
+            return self.interface.pymbolic_list_tensor(o)
 
     def identity(self, o):
         return self.interface.pymbolic_identity(o)
@@ -355,12 +345,21 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         if get_global_context_value("driver", False):
             return prim.Variable("x")
         else:
-            return self.interface.pymbolic_spatial_coordinate(self)
+            return self.interface.pymbolic_spatial_coordinate()
 
     def facet_normal(self, o):
         # The normal must be restricted to be well-defined
         assert self.restriction is not Restriction.NONE
 
+        # Optimize facet normal on axiparallel grids
+        from dune.perftool.options import get_option
+        if get_option("diagonal_transformation_matrix"):
+            index, = self.indices
+            from dune.perftool.sumfact.switch import get_facedir
+            if isinstance(index, int) and index != get_facedir(self.restriction):
+                self.indices = None
+                return 0
+
         if self.restriction == Restriction.POSITIVE:
             return Variable(self.interface.name_unit_outer_normal())
         if self.restriction == Restriction.NEGATIVE:
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 72fbf5af11794ff5d3b80ed16fe6576c071103ea..824f2721f163a46d6a76ac9c256c95b3f9885c4c 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -13,7 +13,7 @@ dune_add_formcompiler_system_test(UFLFILE poisson_3d.ufl
 #================
 # Opcounter tests
 #================
-dune_add_formcompiler_system_test(UFLFILE opcount_poisson_2d_order2.ufl
+dune_add_formcompiler_system_test(UFLFILE poisson_2d.ufl
                                   BASENAME opcount_sumfact_poisson_2d_order2
                                   INIFILE opcount_poisson_2d_order2.mini
                                   )
diff --git a/test/sumfact/poisson/opcount_poisson_2d_order2.mini b/test/sumfact/poisson/opcount_poisson_2d_order2.mini
index 627f5bb3eb7cee357a8dc10db82353491153d648..d5fe76eca69bc4e3cd2e12800f816675d83ff50a 100644
--- a/test/sumfact/poisson/opcount_poisson_2d_order2.mini
+++ b/test/sumfact/poisson/opcount_poisson_2d_order2.mini
@@ -18,3 +18,6 @@ compare_l2errorsquared = 1e-8
 sumfact = 1
 opcounter = 1
 instrumentation_level = 4
+
+[formcompiler.ufl_variants]
+degree = 2
diff --git a/test/sumfact/poisson/opcount_poisson_2d_order2.ufl b/test/sumfact/poisson/opcount_poisson_2d_order2.ufl
deleted file mode 100644
index 23982245c5917beb2e74fbedc0ecf399fec549c2..0000000000000000000000000000000000000000
--- a/test/sumfact/poisson/opcount_poisson_2d_order2.ufl
+++ /dev/null
@@ -1,10 +0,0 @@
-cell = "quadrilateral"
-
-f = Expression("Dune::FieldVector<oc::OpCounter<double>,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*exp(-1.*c.two_norm2());", cell=cell)
-g = Expression("Dune::FieldVector<oc::OpCounter<double>,2> c(0.5); c-= x; return exp(-1.*c.two_norm2());", cell=cell)
-
-V = FiniteElement("CG", cell, 2, dirichlet_expression=g)
-u = TrialFunction(V)
-v = TestFunction(V)
-
-forms = [(inner(grad(u), grad(v)) - f*v)*dx]