diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index df47d5e9fb1ac03d8ed6bde6ec1de203470f4d2f..a4d8292f5f88bc315980efb521bf7c2cf6a95153 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -140,10 +140,9 @@ def _insn_cache_key(code=None, expression=None, **kwargs):
 def instruction(code=None, expression=None, **kwargs):
     assert (code is not None) or (expression is not None)
     assert not ((code is not None) and (expression is not None))
-    assert 'id' not in kwargs
 
     # Get an ID for this instruction
-    id = 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4))
+    id = kwargs.pop("id", 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4)))
 
     # Now create the actual instruction
     if code:
diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py
index c859c99d0ab964e298a55bf2c9854f170c507599..3877165c574ac8ef7b6d7197b56f97d071f38f42 100644
--- a/python/dune/perftool/pdelab/driver/__init__.py
+++ b/python/dune/perftool/pdelab/driver/__init__.py
@@ -298,10 +298,10 @@ def generate_driver():
     add_section("constraints", "Set up constraints container...")
     add_section("gridoperator", "Set up grid grid operators...")
     add_section("vector", "Set up solution vectors...")
+    add_section("timings", "Maybe take performance measurements...")
     add_section("solver", "Set up (non)linear solvers...")
     add_section("vtk", "Do visualization...")
     add_section("instat", "Set up instationary stuff...")
-    add_section("timings", "Maybe take performance measurements...")
     add_section("printing", "Maybe print residuals and matrices to stdout...")
     add_section("error", "Maybe calculate errors for test results...")
 
diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py
index 8f3ef96ff9fd2babe838a2b3ceb7baeeb63d67f7..0648df399d809fd84d30624711dfc2fb02bbc631 100644
--- a/python/dune/perftool/pdelab/driver/solve.py
+++ b/python/dune/perftool/pdelab/driver/solve.py
@@ -54,7 +54,8 @@ def dune_solve():
     print_matrix()
 
     if get_option('instrumentation_level') >= 2:
-        from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream
+        from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier
+        timestream = name_timing_stream()
         setup_timer()
         from dune.perftool.generation import post_include
         post_include("HP_DECLARE_TIMER(solve);", filetag="driver")
@@ -68,7 +69,6 @@ def dune_solve():
         if get_option('instrumentation_level') >= 3:
             from dune.perftool.pdelab.driver.gridoperator import name_localoperator
             lop_name = name_localoperator(form_ident)
-            timestream = name_timing_stream()
             solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
     return solve
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 7d1ded787fa9fc6cd71a5bd975c6123231dfeffe..7fe95bbe858be0aa74df2c313ad6bd96a7507e68 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -495,7 +495,7 @@ def generate_kernels_per_integral(integrals):
     yield generate_kernel(integrals)
 
 
-def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True):
+def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timings=True):
     # Now extract regular loopy kernel components
     from dune.perftool.loopy.target import DuneTarget
     domains = [i for i in retrieve_cache_items("{} and domain".format(tag))]
@@ -567,7 +567,7 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True):
         if signature is None:
             from dune.perftool.pdelab.signatures import assembly_routine_signature
             signature = assembly_routine_signature()
-        kernel = LoopyKernelMethod(signature, kernel)
+        kernel = LoopyKernelMethod(signature, kernel, add_timings=add_timings)
 
     return kernel
 
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index 8881a510fefed4ed72fa49d2222de57e9c3cf7fb..e258f7adfa9acb4a9d6e841a2ed863b20a76cb5f 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -166,7 +166,8 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord):
                               forced_iname_deps=frozenset(inames + additional_inames + self.within_inames),
                               forced_iname_deps_is_final=True,
                               depends_on=insn_dep,
-                              predicates=sf.predicates
+                              predicates=sf.predicates,
+                              tags=frozenset({"sumfact_stage3"}),
                               )
 
         return frozenset({dep})
@@ -211,7 +212,10 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord):
             lhs = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + inames)
 
         result = prim.Sum((lhs, result))
-        return frozenset({instruction(assignee=lhs, expression=result, **args)})
+        return frozenset({instruction(assignee=lhs,
+                                      expression=result,
+                                      tags=frozenset({"sumfact_stage3"}),
+                                      **args)})
 
 
 def _local_sizes(element):
@@ -449,12 +453,18 @@ def generate_accumulation_instruction(expr, visitor):
     # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
     timer_dep = frozenset()
     if get_option("instrumentation_level") >= 4:
-        timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+        timer_name ="{}_kernel_stage1".format(assembler_routine_name())
+        timer_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                               depends_on=frozenset({lp.match.Tagged("sumfact_stage1"), 'hptimerstart_{}'.format(timer_name)}),
+                               id="hptimerstop_{}".format(timer_name)
+                               )})
+        timer_name = '{}_kernel_quadratureloop'.format(assembler_routine_name())
         post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
         dump_accumulate_timer(timer_name)
-        if(jacobian_inames):
-            timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                               within_inames=frozenset(jacobian_inames))})
+        timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                           within_inames=frozenset(jacobian_inames),
+                                           id="hptimerstart_{}".format(timer_name),
+                                           depends_on=timer_dep)})
 
     # Determine dependencies
     from loopy.match import Or, Writes
@@ -478,9 +488,10 @@ def generate_accumulation_instruction(expr, visitor):
         insn_dep = frozenset({contrib_dep})
 
     if get_option("instrumentation_level") >= 4:
-        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                          depends_on=insn_dep,
-                                          within_inames=frozenset(jacobian_inames))})
+        insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                             depends_on=insn_dep,
+                                             within_inames=frozenset(jacobian_inames),
+                                             id="hptimerstop_{}".format(timer_name))}))
 
     # Add a sum factorization kernel that implements the multiplication
     # with the test function (stage 3)
@@ -488,4 +499,12 @@ def generate_accumulation_instruction(expr, visitor):
     result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep)))
 
     if not get_form_option("fastdg"):
-        vsf.output.realize(vsf, result, insn_dep)
+        insn_dep = vsf.output.realize(vsf, result, insn_dep)
+
+    if get_option("instrumentation_level") >= 4:
+        assert vsf.stage == 3
+        timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), vsf.stage)
+        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                          depends_on=frozenset({lp.match.Tagged("sumfact_stage3")}),
+                                          within_inames=frozenset(jacobian_inames),
+                                          id="hptimerstop_{}".format(timer_name))})
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index 620faab65b9c089b55b47bf2f5d3e859b2798706..dfe080284986f20c1bc7fd641b8cf6b4345d2997 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -77,6 +77,32 @@ def name_buffer_storage(buff, which):
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
 
+    # Measure times and count operations in C++ code
+    if get_option("instrumentation_level") >= 4:
+        setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
+        timer_dep = frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer),
+                                           id="hptimerstop_{}".format(setuptimer))})
+
+        timer_name = '{}_kernel_stage1'.format(assembler_routine_name())
+        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+        dump_accumulate_timer(timer_name)
+        timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                                           id="hptimerstart_{}".format(timer_name),
+                                                           depends_on=timer_dep,
+                                                           ),
+                                               }))
+
+        timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), sf.stage)
+        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+        dump_accumulate_timer(timer_name)
+        timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                                           id="hptimerstart_{}".format(timer_name),
+                                                           within_inames=frozenset(sf.within_inames),
+                                                           depends_on=timer_dep.union(insn_dep),
+                                                           ),
+                                               }))
+        insn_dep = insn_dep.union(timer_dep)
+
     # Get all the necessary pieces for a function call
     funcname = name_kernel_implementation_function(sf)
     buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2))
@@ -101,10 +127,11 @@ def _realize_sum_factorization_kernel(sf):
 
     # Call the function
     code = "{}({}, {});".format(funcname, *buffers)
+    tag = "sumfact_stage{}".format(sf.stage)
     insn_dep = frozenset({instruction(code=code,
                                       depends_on=insn_dep,
                                       within_inames=frozenset(sf.within_inames),
-                                      tags=frozenset({"sumfact_stage{}".format(sf.stage)}),
+                                      tags=frozenset({tag}),
                                       )
                           })
 
@@ -290,7 +317,7 @@ def realize_sumfact_kernel_function(sf):
     name = name_kernel_implementation_function(sf)
     from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
     signature = "void {}(const char* buffer0, const char* buffer1) const".format(name)
-    kernel = extract_kernel_from_cache("kernel_default", name, [signature])
+    kernel = extract_kernel_from_cache("kernel_default", name, [signature], add_timings=False)
     delete_cache_items("kernel_default")
     return kernel