diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index e53422138b7b2ae549373816cfe244ed47e4d6bb..115aede7f16b081399ccc8016961f035162869d2 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -491,9 +491,15 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
     preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))]
     kernel = kernel.copy(preambles=preambles)
 
+    # Remove inames that have become obsolete
+    kernel = lp.remove_unused_inames(kernel)
+
     # Do the loopy preprocessing!
     kernel = preprocess_kernel(kernel)
 
+    # Forcibly disable boostability on every instruction. So far this is a necessary workaround for a bug that has not been tracked down yet.
+    kernel = kernel.copy(instructions=[i.copy(boostable=False, boostable_into=frozenset()) for i in kernel.instructions])
+
     if wrap_in_cgen:
         # Wrap the kernel in something which can generate code
         from dune.perftool.pdelab.signatures import assembly_routine_signature
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index a7b3c4c98ac3c5b262ff1f7dee9b5acecba33aa9..40f042ef96f1cf167487008d3e07b1cfe61374e6 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -247,9 +247,9 @@ def evaluate_reference_gradient(element, name, restriction):
     i = 0
     for d in range(local_dimension()):
         if d == facedir:
-            i = i+1
+            i = i + 1
         quadinamemapping[i] = quad_inames[d]
-        i = i+1
+        i = i + 1
 
     for d in range(dim):
         prod = []
@@ -261,7 +261,7 @@ def evaluate_reference_gradient(element, name, restriction):
         if facedir is not None:
             facemod = get_facemod(restriction)
             from dune.perftool.sumfact.amatrix import PolynomialLookup, name_polynomials
-            prod.append(prim.Call(PolynomialLookup(name_polynomials(), facedir==d),
+            prod.append(prim.Call(PolynomialLookup(name_polynomials(), facedir == d),
                                   (prim.Variable(inames[facedir]), facemod)),)
 
         assignee = prim.Subscript(prim.Variable(name), (d,))
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 539d2f698b8c23c5f13b30f66229def5137705be..4205aedb8276c4f1a3bcf61fc57985a47e756935 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -308,7 +308,6 @@ def sum_factorization_kernel(a_matrices, buf, stage,
         # Get the inames needed for one matrix-matrix multiplication
         i = sumfact_iname(out_shape[0], "row")
         j = sumfact_iname(out_shape[1], "col")
-        k = sumfact_iname(a_matrix.cols, "red")
 
         # Maybe introduce a vectorization iname for this matrix-matrix multiplication
         vec_iname = ()
@@ -317,15 +316,27 @@ def sum_factorization_kernel(a_matrices, buf, stage,
             vec_iname = (prim.Variable(iname),)
             transform(lp.tag_inames, [(iname, "vec")])
 
-        # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-        prod = Product((Subscript(Variable(a_matrix.name), (Variable(i), Variable(k)) + vec_iname),
-                        Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
-                        ))
+        if a_matrix.cols == 1:
+            # A reduction over a single element is implemented as a plain product.
+            # Otherwise we run into a code generation corner case that produces
+            # needlessly complicated code. This could be fixed upstream, but the loopy
+            # code realizing reductions is not trivial and the fix has low priority.
+            matprod = Product((Subscript(Variable(a_matrix.name), (Variable(i), 0) + vec_iname),
+                               Subscript(Variable(inp), (0, Variable(j)) + vec_iname)
+                               ))
+        else:
+            k = sumfact_iname(a_matrix.cols, "red")
+
+            # Construct the matrix-matrix-multiplication expression a_ik*in_kj
+            prod = Product((Subscript(Variable(a_matrix.name), (Variable(i), Variable(k)) + vec_iname),
+                            Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
+                            ))
+            matprod = Reduction("sum", k, prod)
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on
         insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname),
-                                          expression=Reduction("sum", k, prod),
+                                          expression=matprod,
                                           forced_iname_deps=frozenset({i, j}).union(additional_inames),
                                           forced_iname_deps_is_final=True,
                                           depends_on=insn_dep,
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index a9e050fb68c3f583d283a62a39943f1deef53093..197e6e61de48661f733a6f28c0faaf3ddf95b8cf 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -38,7 +38,7 @@ def no_vectorization(sumfacts):
                            sumf.restriction,
                            sumf.a_matrices,
                            get_counted_variable("buffer"),
-                           get_counted_variable(restricted_name("input", sumf.restriction)),
+                           get_counted_variable("input"),
                            None)
 
 
@@ -135,4 +135,4 @@ class HasSumfactMapper(lp.symbolic.CombineMapper):
 
 
 def find_sumfact(expr):
-    return HasSumfactMapper()(expr)
+    return HasSumfactMapper()(expr)
\ No newline at end of file
diff --git a/python/loopy b/python/loopy
index 36c9bb5c0a5905022fc850c3efc5ad7661e5f897..c16057b7c361584d04edb59132f0742ecaa38226 160000
--- a/python/loopy
+++ b/python/loopy
@@ -1 +1 @@
-Subproject commit 36c9bb5c0a5905022fc850c3efc5ad7661e5f897
+Subproject commit c16057b7c361584d04edb59132f0742ecaa38226