diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake
index 2e3de48d33d9939aed33ffbec4d90035f50eb42b..1803b1e508e43188c22ee1b846bd3d42abf0211f 100644
--- a/cmake/modules/GeneratedSystemtests.cmake
+++ b/cmake/modules/GeneratedSystemtests.cmake
@@ -49,7 +49,7 @@ function(dune_add_formcompiler_system_test)
 
     add_generated_executable(TARGET ${tname}
                              UFLFILE ${SYSTEMTEST_UFLFILE}
-                             FORM_COMPILER_ARGS --ini-file ${inifile} --timer
+                             FORM_COMPILER_ARGS --ini-file ${inifile}
                              )
 
     # Exclude the target from all
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 5f27ede95ccb273d089f231fc64083cb0c854d51..b521bf6dc61b99c0c614116fada26f476926e51a 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -39,8 +39,9 @@ def get_form_compiler_arguments():
     parser.add_argument("--diagonal-transformation-matrix", action="store_true", help="set option if the jacobian of the transformation is diagonal (axiparallel grids)")
     parser.add_argument("--constant-transformation-matrix", action="store_true", help="set option if the jacobian of the transformation is constant on a cell")
     parser.add_argument("--ini-file", type=str, help="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)")
-    parser.add_argument("--timer", action="store_true", help="measure times")
-    parser.add_argument("--opcounter", action="store_true", default=False, help="Count operations. Should only be used with yaspgrid. Timer should be set.")
+    parser.add_argument("--opcounter", action="store_true", help="Count operations. Note: In this case only operator applications are generated since solving and operator counting does not work.")
+    parser.add_argument("--time-opcounter", action="store_true", help="Generate opcounter codepath. Can be used for timing opcounter programs without setting the opcounter option.")
+    parser.add_argument("--instrumentation-level", type=int, default=0, help="Control time/opcounter measurements. 0-do nothing, 1-measure program as a whole, 2-operator applications, 3-measure kernel (eg. alpha-volume, ...), 4-parts of kernel (eg. stage 1-3 of SF)")
     parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
     parser.add_argument("--fastdg", action="store_true", help="Use FastDGGridOperator from PDELab.")
     # TODO at some point this help description should be updated
diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py
index 62a72fcea2c6ebc82596be76fa17f1d4d19dc15c..775b3bda292210ab142d4c37dcfde16cf3852fe5 100644
--- a/python/dune/perftool/pdelab/driver.py
+++ b/python/dune/perftool/pdelab/driver.py
@@ -1188,10 +1188,10 @@ def dune_solve():
         snp = name_stationarynonlinearproblemsolver(go_type, go)
         solve = "{}.apply();".format(snp)
 
-    if get_option('timer'):
+    if get_option('instrumentation_level') >= 2:
         setup_timer()
         from dune.perftool.generation import post_include
-        post_include("HP_DECLARE_TIMER(total);", filetag="driver")
+        post_include("HP_DECLARE_TIMER(solve);", filetag="driver")
 
         # Print times after solving
         from dune.perftool.generation import get_global_context_value
@@ -1202,12 +1202,14 @@ def dune_solve():
             lop_name = name_localoperator(formdata)
             timestream = name_timing_stream()
             print_times.append("{}.dump_timers({}, argv[0], true);".format(lop_name, timestream))
-        solve = ["HP_TIMER_START(total);",
+
+        solve = ["HP_TIMER_START(solve);",
                  "{}".format(solve),
-                 "HP_TIMER_STOP(total);",
-                 "DUMP_TIMER(total, {}, true);".format(timestream),
+                 "HP_TIMER_STOP(solve);",
+                 "DUMP_TIMER(solve, {}, true);".format(timestream),
                  ]
-        solve.extend(print_times)
+        if get_option('instrumentation_level') >= 3:
+            solve.extend(print_times)
 
     return solve
 
@@ -1391,7 +1393,7 @@ def name_test_fail_variable():
 
 @cached
 def setup_timer():
-    assert(get_option('timer'))
+    assert(get_option('instrumentation_level') >= 1)
 
     # Necessary includes and defines
     from dune.perftool.generation import pre_include
@@ -1697,7 +1699,9 @@ def generate_driver(formdatas, data):
     set_driver_data(formdatas, data)
 
     # Entrypoint for driver generation
-    if get_option("opcounter"):
+    if get_option("opcounter") or get_option("time_opcounter"):
+        if get_option("time_opcounter"):
+            assert(not get_option("opcounter"))
         assert(any(_driver_data['form'].ufl_cell().cellname() in x for x in
                    ["vertex", "interval", "quadrilateral", "hexahedron"]))
         # In case of operator conunting we only assemble the matrix and evaluate the residual
@@ -1714,7 +1718,18 @@ def generate_driver(formdatas, data):
     from dune.perftool.generation import retrieve_cache_items
     from cgen import FunctionDeclaration, FunctionBody, Block, Value
     driver_signature = FunctionDeclaration(Value('bool', 'driver'), [Value('int', 'argc'), Value('char**', 'argv')])
-    driver_body = Block(contents=[i for i in retrieve_cache_items("preamble", make_generable=True)])
+    contents = [i for i in retrieve_cache_items("preamble", make_generable=True)]
+
+    from cgen import Line
+    if get_option("instrumentation_level") >= 1:
+        from dune.perftool.generation import post_include
+        post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver")
+        contents.insert(0, Line(text="HP_TIMER_START(driver);\n"))
+        contents.insert(len(contents) - 1, Line(text="HP_TIMER_STOP(driver);\n"))
+        timestream = name_timing_stream()
+        contents.insert(len(contents) - 1, Line(text="DUMP_TIMER(driver, {}, true);\n".format(timestream)))
+    contents.insert(0, Line(text="\n"))
+    driver_body = Block(contents)
     driver = FunctionBody(driver_signature, driver_body)
 
     filename = get_option("driver_file")
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 0686f44c49001424ef3cb819c9df134243ab5b18..839c57f8f5bef06cd85a241a283684ab87311f4b 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -556,7 +556,6 @@ class TimerMethod(ClassMember):
         knl = name_example_kernel()
         assert(knl is not None)
 
-        # TODO: operator counting only works if alpha_volume_kernel exists
         content = ["template <typename Stream>",
                    "void dump_timers(Stream& {}, char* {}, bool {})".format(os, ex, reset),
                    "{",
@@ -595,7 +594,7 @@ class LoopyKernelMethod(ClassMember):
                 content.append('  ' + p)
 
             # Start timer
-            if add_timings and get_option('timer'):
+            if add_timings and get_option('instrumentation_level') >= 3:
                 from dune.perftool.pdelab.signatures import assembler_routine_name
                 timer_name = assembler_routine_name() + '_kernel'
                 name_example_kernel(name=timer_name)
@@ -607,7 +606,7 @@ class LoopyKernelMethod(ClassMember):
             content.extend(l for l in generate_body(kernel).split('\n')[1:-1])
 
             # Stop timer
-            if add_timings and get_option('timer'):
+            if add_timings and get_option('instrumentation_level') >= 3:
                 content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
 
         content.append('}')
@@ -814,7 +813,7 @@ def generate_localoperator_file(formdata, kernels, filename):
     for k in kernels.values():
         operator_methods.extend(k)
 
-    if get_option('timer'):
+    if get_option('instrumentation_level') >= 3:
         include_file('dune/perftool/common/timer.hh', filetag='operatorfile')
         operator_methods.append(TimerMethod())
 
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 35d91cc0d553f803988c8c6e932aed18144c6337..216ce95c0f7a06ab99cedd7210f01b740b69999b 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -10,6 +10,7 @@ from dune.perftool.generation import (backend,
                                       barrier,
                                       built_instruction,
                                       domain,
+                                      dump_accumulate_timer,
                                       function_mangler,
                                       generator_factory,
                                       get_counter,
@@ -17,6 +18,7 @@ from dune.perftool.generation import (backend,
                                       globalarg,
                                       iname,
                                       instruction,
+                                      post_include,
                                       kernel_cached,
                                       retrieve_cache_items,
                                       silenced_warning,
@@ -32,6 +34,7 @@ from dune.perftool.loopy.buffer import (get_buffer_temporary,
 from dune.perftool.sumfact.quadrature import nest_quadrature_loops
 from dune.perftool.pdelab.localoperator import determine_accumulation_space
 from dune.perftool.pdelab.restriction import restricted_name
+from dune.perftool.pdelab.signatures import assembler_routine_name
 from dune.perftool.pdelab.spaces import (name_lfs,
                                          name_lfs_bound,
                                          )
@@ -178,6 +181,16 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
             replace_dict[Variable(iname)] = i
         expression = substitute(pymbolic_expr, replace_dict)
 
+        # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
+        timer_dep = frozenset()
+        if get_option("instrumentation_level") >= 4:
+            timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+            dump_accumulate_timer(timer_name)
+            if(visitor.inames):
+                timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                                   within_inames=frozenset(visitor.inames))})
+
         # Determine dependencies
         from loopy.match import Or, Writes
         from loopy.symbolic import DependencyMapper
@@ -192,12 +205,17 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                   forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
                                   forced_iname_deps_is_final=True,
                                   tags=frozenset({"quadvec"}).union(vectag),
-                                  depends_on=frozenset({deps})
+                                  depends_on=frozenset({deps}).union(timer_dep)
                                   )
 
         if insn_dep is None:
             insn_dep = frozenset({contrib_dep})
 
+        if get_option("instrumentation_level") >= 4:
+            insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                              depends_on=insn_dep,
+                                              within_inames=frozenset(visitor.inames))})
+
         # Add a sum factorization kernel that implements the multiplication
         # with the test function (stage 3)
         pref_pos = i if accterm.argument.index else None
@@ -351,6 +369,14 @@ def sum_factorization_kernel(a_matrices,
         ctags = ctags + ",vec"
         vec_shape = (4,)
 
+    if get_option("instrumentation_level") >= 4:
+        timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(stage)
+        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+        dump_accumulate_timer(timer_name)
+        insn_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+                                          depends_on=insn_dep,
+                                          within_inames=additional_inames)})
+
     insn_dep = frozenset({barrier(depends_on=insn_dep,
                                   within_inames=additional_inames,
                                   )})
@@ -432,6 +458,17 @@ def sum_factorization_kernel(a_matrices,
                                           )
                               })
 
+    if get_option("instrumentation_level") >= 4:  # stop the per-stage timer started before the barrier
+        insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+                                          depends_on=insn_dep,
+                                          within_inames=additional_inames)})
+        if stage == 1:  # once stage 1 is done, the quadrature loop timer is started here
+            qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+            post_include('HP_DECLARE_TIMER({});'.format(qp_timer_name), filetag='operatorfile')
+            dump_accumulate_timer(qp_timer_name)
+            insn_dep = instruction(code="HP_TIMER_START({});".format(qp_timer_name),
+                                   depends_on=insn_dep)
+
     if outshape is None:
         assert stage == 3
         outshape = tuple(mat.rows for mat in a_matrices)