diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index a8b69b7326f76ab51fe26635cfd7d4557750bc19..5797c08c7e6966d76c51860598994407e4c206ad 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -614,10 +614,6 @@ class LoopyKernelMethod(ClassMember):
 
         content.append('{')
         if kernel is not None:
-            # Add kernel preamble
-            for i, p in kernel.preambles:
-                content.append('  ' + p)
-
             # Start timer
             if add_timings and get_option('instrumentation_level') >= 3:
                 from dune.perftool.pdelab.signatures import assembler_routine_name
@@ -627,6 +623,15 @@ class LoopyKernelMethod(ClassMember):
                 content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
                 dump_accumulate_timer(timer_name)
 
+            if add_timings and get_option("instrumentation_level") >= 4:
+                setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
+                post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
+                content.append('  HP_TIMER_START({});'.format(setuptimer))
+
+            # Add kernel preamble
+            for i, p in kernel.preambles:
+                content.append('  ' + p)
+
             # Add kernel body
             content.extend(l for l in generate_body(kernel).split('\n')[1:-1])
 
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index 5fe9ebb121a925d1d33ade412fa0c3cb9b4a794b..13acc6acb6a65607165961d59c33a6fba18fb876 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -59,6 +59,12 @@ def _realize_sum_factorization_kernel(sf):
 
     # Measure times and count operations in c++ code
     if get_option("instrumentation_level") >= 4:
+        if sf.stage == 1:
+            setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
+            insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer),
+                                                             within_inames=frozenset(sf.within_inames),
+                                                             depends_on=insn_dep)}))
+
         timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage)
         post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
         dump_accumulate_timer(timer_name)