diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index 029ca0e3937c18119c8d8aea4166359de0746057..135fcc3d4cf50bf8e5e53822c29c8997516c9d2b 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -37,17 +37,18 @@ import numpy
 
 
 class AMatrix(ImmutableRecord):
-    def __init__(self, rows, cols, transpose=False, derivative=False):
+    def __init__(self, rows, cols, transpose=False, derivative=False, face=None):
         ImmutableRecord.__init__(self,
                                  rows=rows,
                                  cols=cols,
                                  transpose=transpose,
                                  derivative=derivative,
+                                 face=face,
                                  )
 
     @property
     def name(self):
-        return name_theta(self.transpose, self.derivative)
+        return name_theta(self.transpose, self.derivative, face=self.face)
 
     @property
     def vectorized(self):
@@ -216,7 +217,7 @@ def polynomial_lookup_mangler(target, func, dtypes):
         return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))
 
 
-def define_theta(name, shape, transpose, derivative, additional_indices=()):
+def define_theta(name, shape, transpose, derivative, face=None, additional_indices=()):
     sort_quadrature_points_weights()
     polynomials = name_polynomials()
     qp = name_oned_quadrature_points()
@@ -238,28 +239,38 @@ def define_theta(name, shape, transpose, derivative, additional_indices=()):
     i = theta_iname("i", shape[0])
     j = theta_iname("j", shape[1])
 
+    inames = i, j
     if transpose:
-        args = (prim.Variable(i), prim.Subscript(prim.Variable(qp), (prim.Variable(j),)))
-    else:
-        args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),)))
+        inames = j, i
+
+    args = [prim.Variable(inames[1]), prim.Subscript(prim.Variable(qp), (prim.Variable(inames[0]),))]
+    if face is not None:
+        args[1] = face
 
     instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j)) + additional_indices),
-                expression=prim.Call(PolynomialLookup(polynomials, derivative), args),
+                expression=prim.Call(PolynomialLookup(polynomials, derivative), tuple(args)),
                 kernel="operator",
                 )
 
 
-def name_theta(transpose=False, derivative=False):
-    name = "{}Theta{}".format("d" if derivative else "", "T" if transpose else "")
+def name_theta(transpose=False, derivative=False, face=None):
+    name = "{}{}Theta{}".format("face{}_".format(face) if face is not None else "",
+                                "d" if derivative else "",
+                                "T" if transpose else "",
+                                )
+
+    shape = [quadrature_points_per_direction(), basis_functions_per_direction()]
+    if face is not None:
+        shape[0] = 1
     if transpose:
-        shape = (basis_functions_per_direction(), quadrature_points_per_direction())
-    else:
-        shape = (quadrature_points_per_direction(), basis_functions_per_direction())
-    define_theta(name, shape, transpose, derivative)
+        shape = shape[1], shape[0]
+    shape = tuple(shape)
+
+    define_theta(name, shape, transpose, derivative, face=face)
     return name
 
 
-def construct_amatrix_sequence(transpose=False, derivative=None, face=None):
+def construct_amatrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None):
     dim = world_dimension()
     result = [None] * dim
 
@@ -267,12 +278,14 @@ def construct_amatrix_sequence(transpose=False, derivative=None, face=None):
         rows = quadrature_points_per_direction()
         cols = basis_functions_per_direction()
 
-        if face == i:
+        onface = None
+        if facedir == i:
             rows = 1
+            onface = facemod
 
         if transpose:
             rows, cols = cols, rows
 
-        result[i] = AMatrix(rows, cols, transpose=transpose, derivative=derivative == i)
+        result[i] = AMatrix(rows, cols, transpose=transpose, derivative=derivative == i, face=onface)
 
     return tuple(result)
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 0965bdce0bef66127b56e144085f282180ead61c..794c3467d286374b2ba3894c2ca40c43792f1dc5 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -66,7 +66,10 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor):
     insn_dep = None
     for i in range(dim):
         # Construct the matrix sequence for this sum factorization
-        a_matrices = construct_amatrix_sequence(derivative=i, face=get_facedir(restriction))
+        a_matrices = construct_amatrix_sequence(derivative=i,
+                                                facedir=get_facedir(restriction),
+                                                facemod=get_facemod(restriction),
+                                                )
 
         # Get the vectorization info. If this happens during the dry run, we get dummies
         from dune.perftool.sumfact.vectorization import get_vectorization_info
@@ -130,7 +133,8 @@ def pymbolic_trialfunction(element, restriction, component, visitor):
     dim = world_dimension()
 
     # Construct the matrix sequence for this sum factorization
-    a_matrices = construct_amatrix_sequence(face=get_facedir(restriction))
+    a_matrices = construct_amatrix_sequence(facedir=get_facedir(restriction),
+                                            facemod=get_facemod(restriction),)
 
     # Get the vectorization info. If this happens during the dry run, we get dummies
     from dune.perftool.sumfact.vectorization import get_vectorization_info
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index ed21964794496bc431ac99c3886672bdfe57d6e1..0f17c0ebc23bfd655f8c1c647c50c2a47be2e8a4 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -43,7 +43,9 @@ from dune.perftool.sumfact.amatrix import (AMatrix,
                                            basis_functions_per_direction,
                                            construct_amatrix_sequence,
                                            )
-from dune.perftool.sumfact.switch import get_facedir
+from dune.perftool.sumfact.switch import (get_facedir,
+                                          get_facemod,
+                                          )
 from dune.perftool.loopy.symbolic import SumfactKernel
 from dune.perftool.tools import get_pymbolic_basename
 from dune.perftool.error import PerftoolError
@@ -106,7 +108,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         return
 
     dim = world_dimension()
-    facedir = get_facedir(accterm.argument.restriction)
 
     # Collect buffers we need
     buffers = []
@@ -128,7 +129,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         # Construct the matrix sequence for this sum factorization
         a_matrices = construct_amatrix_sequence(transpose=True,
                                                 derivative=i if accterm.argument.index else None,
-                                                face=facedir,
+                                                facedir=get_facedir(accterm.argument.restriction),
+                                                facemod=get_facemod(accterm.argument.restriction),
                                                 )
 
         # Get the vectorization info. If this happens during the dry run, we get dummies