diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index d56e080b5856367170f882b2df43fbea7e27f738..408302b22a0bf38fec226c1b80d81b8df78089ed 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -167,7 +167,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                                  name=inp,
                                                  )
 
-        # Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron
+        # Those input fields, that are padded need to be set to zero
+        # in order to do a horizontal_add lateron
         for pad in padding:
             instruction(assignee=prim.Subscript(prim.Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + (pad,)),
                         expression=0,
@@ -217,17 +218,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                               depends_on=insn_dep,
                                               within_inames=frozenset(visitor.inames))})
 
-        # Add a sum factorization kernel that implements the multiplication
-        # with the test function (stage 3)
-        pref_pos = i if accterm.argument.index else None
-        result, insn_dep = sum_factorization_kernel(a_matrices,
-                                                    buf,
-                                                    3,
-                                                    insn_dep=insn_dep,
-                                                    additional_inames=frozenset(visitor.inames),
-                                                    preferred_position=pref_pos,
-                                                    restriction=(accterm.argument.restriction, restriction),
-                                                    )
 
         inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) for i, mat in enumerate(a_matrices))
 
@@ -242,7 +232,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure)
         rank = 2 if visitor.inames else 1
         if rank == 2:
-            # TODO the next line should get its inames from elsewhere. This is *NOT* robust (but works right now)
+            # TODO the next line should get its inames from
+            # elsewhere. This is *NOT* robust (but works right now)
             ansatz_lfs.index = flatten_index(tuple(Variable(visitor.inames[i]) for i in range(world_dimension())),
                                              (basis_functions_per_direction(),) * dim,
                                              order="f"
@@ -251,6 +242,34 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         # Construct the expression representing "{r,jac}.accumulate(..)"
         accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
 
+        # We can directly accumulate the solution in the last step of
+        # the sumfactorization. Currently this is only implemented for
+        # FastDGGridOperator.
+        direct_output = None
+        if get_option('fastdg'):
+            ft = get_global_context_value("form_type")
+            accum = accum + ".data()"
+            if ft == 'residual' or ft == 'jacobian_apply':
+                direct_output = accum
+            else:
+                assert ft == 'jacobian'
+                direct_output = accum
+
+
+        # Add a sum factorization kernel that implements the multiplication
+        # with the test function (stage 3)
+        pref_pos = i if accterm.argument.index else None
+        result, insn_dep = sum_factorization_kernel(a_matrices,
+                                                    buf,
+                                                    3,
+                                                    insn_dep=insn_dep,
+                                                    additional_inames=frozenset(visitor.inames),
+                                                    preferred_position=pref_pos,
+                                                    restriction=(accterm.argument.restriction, restriction),
+                                                    direct_output=direct_output,
+                                                    visitor=visitor
+                                                    )
+
         # Determine the expression to accumulate with. This depends on the vectorization strategy!
         result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
         vecinames = ()
@@ -263,38 +282,22 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                (maybe_wrap_subscript(result, prim.Variable(iname)),),
                                )
 
-        # In the case of FastDGGridOperator we can write directly into the resiudal/jacobi
-        #
-        # TODO: At the moment this only works if we do not vectorize
-        # (over gradients) because loopy tries to acces a vectorclass
-        # variable.
+        # In the fastdg case we accumulate directly in the last step of the SF!
         if get_option('fastdg'):
-            ft = get_global_context_value("form_type")
-            if ft == 'residual' or ft == 'jacobian_apply':
-                accum = accum + ".data()"
-                size = basis_functions_per_direction() ** world_dimension()
-                globalarg(accum, dtype=np.float64, shape=(size,), managed=False)
-                assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,))
-                expression = prim.Sum((assignee, result))
-                instruction(assignee=assignee,
-                            expression=expression,
+            # In the dry run we need an instruction with a
+            # sumfactorization node. We use this to decide on a
+            # vectorization strategy. This is just a dummy
+            # instruction, in the real run the accumulation is done
+            # directly in the sumfactorization.
+            if get_global_context_value("dry_run", False):
+                dummy = "dummy"
+                globalarg(dummy, dtype=np.float64)
+                instruction(assignee = prim.Variable(dummy),
+                            expression = result,
                             forced_iname_deps=frozenset(inames),
                             forced_iname_deps_is_final=True,
                             depends_on=insn_dep,
                             )
-            else:
-                assert ft == 'jacobian'
-                accum = accum + ".data()"
-                size = basis_functions_per_direction() ** world_dimension()
-                globalarg(accum, dtype=np.float64, shape=(size, size), managed=True)
-                assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index, ansatz_lfs.index))
-                expression = prim.Sum((assignee, result))
-                instruction(assignee=assignee,
-                            expression=expression,
-                            forced_iname_deps=frozenset(inames + visitor.inames),
-                            forced_iname_deps_is_final=True,
-                            depends_on=insn_dep,
-                            )
         # Default: Generate accumulation instructions
         else:
             expr = Call(PDELabAccumulationFunction(accum, rank),
@@ -368,7 +371,7 @@ def _sf_flop_cost(a_matrices):
 
 
 def _sf_permutation_strategy(a_matrices, stage):
-    """Choose permutation of a_matices list based on computational cost
+    """Choose permutation of a_matrices list based on computational cost
 
     Note: If there are multiple permutations with the same cost a
     heuristic is used to pick one.
@@ -418,6 +421,8 @@ def sum_factorization_kernel(a_matrices,
                              outshape=None,
                              restriction=0,
                              direct_input=None,
+                             direct_output=None,
+                             visitor=None,
                              ):
     """Create a sum factorization kernel
 
@@ -481,6 +486,9 @@ def sum_factorization_kernel(a_matrices,
     direct_input: Global data structure containing input for
         sumfactorization (e.g. when using FastDGGridOperator).
     """
+    # Return a pymbolic SumfactKernel node in the dry run. This will
+    # be used to decide on an appropriate vectorization strategy
+    # before we do the real thing.
     if get_global_context_value("dry_run", False):
         return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
 
@@ -614,7 +622,36 @@ def sum_factorization_kernel(a_matrices,
         output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
         if l == len(a_matrices)-1:
             output_inames = _permute_backward(output_inames, perm)
-        assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
+
+        # In case of direct output we directly accumulate the result
+        # of the Sumfactorization into some global data structure.
+        if l == len(a_matrices)-1 and direct_output is not None:
+            ft = get_global_context_value("form_type")
+            novec_ftags = ",".join(["f"]*len(a_matrices))
+            if ft == 'residual' or ft == 'jacobian_apply':
+                globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags)
+                assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
+            else:
+                assert ft == 'jacobian'
+                globalarg(direct_output,
+                          dtype=np.float64,
+                          shape=output_shape + output_shape,
+                          dim_tags = novec_ftags + "," + novec_ftags)
+                # TODO the next line should get its inames from
+                # elsewhere. This is *NOT* robust (but works right
+                # now)
+                _ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension()))
+                assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames)
+
+            # In case of vectorization we need to apply a horizontal add
+            if a_matrix.vectorized:
+                matprod = prim.Call(prim.Variable("horizontal_add"),
+                                    (matprod,))
+
+            # We need to accumulate
+            matprod = prim.Sum((assignee, matprod))
+        else:
+            assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on