diff --git a/python/dune/perftool/loopy/transformations/instrumentation.py b/python/dune/perftool/loopy/transformations/instrumentation.py
index 89b08b6f0e191ca06db56d820c585ebe585e250b..2771ba141ccf5978067e3569038c4307bd01a357 100644
--- a/python/dune/perftool/loopy/transformations/instrumentation.py
+++ b/python/dune/perftool/loopy/transformations/instrumentation.py
@@ -22,7 +22,7 @@ def _union(a):
     return frozenset.union(*a)
 
 
-def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False):
+def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False, depends_on=frozenset()):
     """ Transform loopy kernel to contain instrumentation code
 
     Arguments:
@@ -32,6 +32,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     identifier : The name of the counter to start and stop
     level : The instrumentation level this measurement is defined at
     filetag : The tag of the file that should contain the counter definitions
+    depends_on : frozenset of additional dependencies to add to the start instruction (e.g. loopy
+                 match expressions). This is a workaround for currently wrong behaviour of the
+                 transformation, for callers that know the instrumentation structure a priori.
     """
     # If the instrumentation level is not high enough, this is a no-op
     if level > get_option("instrumentation_level"):
@@ -53,6 +56,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     insn_inames = _intersect(tuple(i.within_inames for i in insns))
     other_inames = _union(tuple(i.within_inames for i in lp.find_instructions(knl, lp.match.Not(match))))
     within = _intersect((insn_inames, other_inames))
+    uniontags = _intersect(tuple(i.tags for i in insns))
 
     # Get a unique identifer - note that the same timer could be started and stopped several times
     # within one kernel...
@@ -67,8 +71,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
                                  "HP_TIMER_START({});".format(identifier),
                                  id=start_id,
                                  within_inames=within,
-                                 depends_on=start_depends,
+                                 depends_on=depends_on.union(start_depends),
                                  boostable_into=frozenset(),
+                                 tags=uniontags,
                                  )
 
     # Add dependencies on the timing instructions
@@ -82,6 +87,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
                                 within_inames=within,
                                 depends_on=frozenset(i.id for i in insns),
                                 boostable_into=frozenset(),
+                                tags=uniontags,
                                 )
 
     # Find all the instructions that should depend on stop
@@ -98,4 +104,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions))
 
     # Add all the modified instructions into the kernel object
-    return knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn])
+    knl = knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn])
+
+    from loopy.kernel.creation import resolve_dependencies
+    return resolve_dependencies(knl)
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 04dad9834dae0a3b1fa6bd06ec204ec1e5d32135..f38b76d7e8f8d65819d4d357f037ecc8f1a3208c 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -569,8 +569,8 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     if add_timings and get_form_option("sumfact"):
         from dune.perftool.pdelab.signatures import assembler_routine_name
         kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage1"), "{}_kernel_stage1".format(assembler_routine_name()), 4)
-        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4)
-        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4)
+        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage1")}))
+        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage2")}))
 
     if wrap_in_cgen:
         # Wrap the kernel in something which can generate code