diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index 40d92670b3524be708d41ad75e5e40aa051bc6ab..7801d1a1bd9982cb474ef1e750fe2a48d1721e6f 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -32,6 +32,7 @@ from dune.perftool.generation.loopy import (constantarg,
                                             globalarg,
                                             iname,
                                             instruction,
+                                            noop_instruction,
                                             pymbolic_expr,
                                             silenced_warning,
                                             temporary_variable,
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index 0fbbfbfc222599371c8feaff98754b275d19c3d9..2b95668fc6db5cd5b8d2be57f14adc5ac40322f9 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -196,3 +196,8 @@ def instruction(code=None, expression=None, **kwargs):
 
     # return the ID, as it is the only useful information to the user
     return id
+
+
+@generator_factory(item_tags=("instruction",), cache_key_generator=lambda **kw: kw['id'])
+def noop_instruction(**kwargs):
+    return loopy.NoOpInstruction(**kwargs)
\ No newline at end of file
diff --git a/python/dune/perftool/loopy/stages.py b/python/dune/perftool/loopy/stages.py
new file mode 100644
index 0000000000000000000000000000000000000000..1048ca7a59b18034a1be3fb6645598d3326feb79
--- /dev/null
+++ b/python/dune/perftool/loopy/stages.py
@@ -0,0 +1,19 @@
+""" loopy instructions to mark stages of computations """
+
+from dune.perftool.generation import noop_instruction
+
+
+def stage_insn(n, **kwargs):
+    assert 'id' not in kwargs
+
+    # Get an ID for this instruction
+    id = 'stage_insn_{}'.format(n)
+
+    # Chain dependencies of stage instructions
+    if n > 0:
+        kwargs['depends_on'] = kwargs.get('depends_on', frozenset([])).union(frozenset([stage_insn(n-1, **kwargs)]))
+
+    # Actually issue the instruction
+    noop_instruction(id=id, **kwargs)
+
+    return id
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 8585f15357cd2a8e03b956c73a967c151c67d67b..0d839d614fefa3f10aaeec39a74a72ecf196556c 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -59,7 +59,7 @@ def start_sumfactorization():
     return sum_factorization_kernel(a_matrices, inp, "buffer")
 
 
-def sum_factorization_kernel(a_matrices, inp, buffer):
+def sum_factorization_kernel(a_matrices, inp, buffer, stage=0):
     """
     Calculate a sum factorization matrix product.
 
@@ -76,6 +76,10 @@ def sum_factorization_kernel(a_matrices, inp, buffer):
     buffer: A string identifying the flip flop buffer in use
         for intermediate results.
     """
+    # Get the stage instruction
+    from dune.perftool.loopy.stages import stage_insn
+    insn_dep = stage_insn(stage)
+
     for l, a_matrix in enumerate(a_matrices):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the amatrix loop
@@ -108,10 +112,12 @@ def sum_factorization_kernel(a_matrices, inp, buffer):
                         ))
 
         # Issue the reduction instruction that implements the multiplication
-        instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j))),
-                    expression=Reduction("sum", k, prod),
-                    forced_iname_deps=frozenset({i, j}),
-                    forced_iname_deps_is_final=True,
-                    )
+        # at the same time store the instruction ID for the next instruction to depend on
+        insn_dep = instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j))),
+                               expression=Reduction("sum", k, prod),
+                               forced_iname_deps=frozenset({i, j}),
+                               forced_iname_deps_is_final=True,
+                               depends_on=frozenset({insn_dep}),
+                               )
 
     return out