diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 46a639fb9c5b269db31ab10721f8630f74ca6d97..86a9b354efa1d1b6bc07d90edc45b8c531ff577c 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -108,6 +108,7 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor):
                                                  preferred_position=i,
                                                  insn_dep=insn_dep,
                                                  restriction=restriction,
+                                                 outshape=tuple(mat.rows for mat in a_matrices if mat.face is None),
                                                  direct_input=direct_input,
                                                  )
         buffers.append(var)
@@ -174,7 +175,7 @@ def pymbolic_trialfunction(element, restriction, component, visitor):
                                       1,
                                       preferred_position=None,
                                       insn_dep=frozenset({Writes(inp)}),
-                                      outshape=tuple(mat.rows for mat in a_matrices if mat.rows != 1),
+                                      outshape=tuple(mat.rows for mat in a_matrices if mat.face is None),
                                       restriction=restriction,
                                       direct_input=direct_input,
                                       )
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index e135fb75a1a8650a088852f95d67d1da03a4b77e..2429da4ef3d688ebe651f7d4de140ca76a8a779a 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -144,7 +144,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction))
 
         # Initialize a base storage for this buffer and get a temporay pointing to it
-        shape = tuple(mat.cols for mat in a_matrices if mat.cols != 1)
+        shape = tuple(mat.cols for mat in a_matrices if mat.face is None)
         dim_tags = ",".join(['f'] * local_dimension())
         if index is not None:
             shape = shape + (4,)
@@ -434,7 +434,9 @@ def sum_factorization_kernel(a_matrices,
                               })
 
     if outshape is None:
-        outshape = tuple(mat.rows for mat in a_matrices if mat.rows != 1)
+        assert stage == 3
+        outshape = tuple(mat.rows for mat in a_matrices)
+
     dim_tags = ",".join(['f'] * len(outshape))
 
     if next(iter(a_matrices)).vectorized: