diff --git a/bin/donkey_benchmark_compilation_wrapper.sh b/bin/donkey_benchmark_compilation_wrapper.sh
index e2691b5d71db2473b731fabcea6a1095ec2516d9..a786d111264ef0e67769100713f1776e3e002c3d 100755
--- a/bin/donkey_benchmark_compilation_wrapper.sh
+++ b/bin/donkey_benchmark_compilation_wrapper.sh
@@ -13,7 +13,4 @@ ml parmetis
 
 ("$@")
 code=$?
-echo "Code: $code"
-sleep 0.1s
-
 exit $code
diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py
index 7951b8b06bfdeb217bd3328c9b3b88f229bbc606..d383963318291ae947c70b1f083c320a60250303 100755
--- a/bin/donkey_benchmark_execution_wrapper.py
+++ b/bin/donkey_benchmark_execution_wrapper.py
@@ -13,8 +13,3 @@ ret = subprocess.call(command)
 # If that failed - fail!
 if ret != 0:
     sys.exit(ret)
-
-# If that was succesful, wait for the output file to be available on the filesystem
-# This step is necessary because the NFS synchronization is too slow for our workflow.
-while not os.path.isfile(sys.argv[2]):
-    time.sleep(0.1)
diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake
index bd3f9649116d72afee7118e5d3940e5f11a54e4e..61713109b32cc1af4716b0f9f133879592fec49c 100644
--- a/cmake/modules/DuneCodegenMacros.cmake
+++ b/cmake/modules/DuneCodegenMacros.cmake
@@ -81,6 +81,10 @@
 
 find_package(benchmark)
 
+if (DUNE_CODEGEN_PROFILING)
+  find_package(likwid)
+endif()
+
 add_custom_target(generation)
 
 # Gather a list of form compiler sources to add as dependencies
@@ -174,11 +178,8 @@ function(add_generated_executable)
 
   if(DUNE_CODEGEN_PROFILING)
     # This is a bit silly, but cProfile only finds entry point scripts
-    # if their full path is provided. So we resort to using which.
-    dune_execute_process(COMMAND which generate_operators
-                         OUTPUT_VARIABLE fullcommand
-                         OUTPUT_STRIP_TRAILING_WHITESPACE
-                         )
+    # if their full path is provided, so use the one from the Dune virtualenv.
+    set(fullcommand "${DUNE_PYTHON_VIRTUALENV_PATH}/bin/generate_operators")
   endif()
 	
   # Define build rules for all operator header files and gather a list of them
diff --git a/cmake/modules/Findlikwid.cmake b/cmake/modules/Findlikwid.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..778901280ee0b778bd7131cf1fcee4b3181557f3
--- /dev/null
+++ b/cmake/modules/Findlikwid.cmake
@@ -0,0 +1,104 @@
+# .. cmake_module::
+#
+#    Module that checks whether likwid is available and usable.
+#
+#    Variables used by this module which you may want to set:
+#
+#    :ref:`likwid_ROOT`
+#       Path list to search for likwid.
+#
+#    Sets the following variables:
+#
+#    :code:`likwid_FOUND`
+#       True if likwid available.
+#
+#    :code:`likwid_INCLUDE_DIRS`
+#       Path to the likwid include directories.
+#
+#
+#    :code:`likwid_LIBRARIES`
+#       Link against these libraries to use likwid.
+#
+# .. cmake_variable:: likwid_ROOT
+#
+#    You may set this variable to have :ref:`Findlikwid` look
+#    for the likwid package in the given paths before inspecting
+#    system paths.
+#
+find_path(LIKWID_INCLUDE_DIR
+        NAMES "likwid.h"
+        PATHS ${likwid_ROOT}
+        PATH_SUFFIXES "include" "include/likwid"
+        NO_DEFAULT_PATH)
+find_path(LIKWID_INCLUDE_DIR
+        NAMES "likwid.h"
+        PATH_SUFFIXES "include" "include/likwid")
+
+find_library(LIKWID_LIBRARY
+        NAMES "likwid"
+        PATHS ${likwid_ROOT}
+        PATH_SUFFIXES "lib" "lib32" "lib64"
+        NO_DEFAULT_PATH)
+find_library(LIKWID_LIBRARY
+        NAMES "likwid"
+        PATH_SUFFIXES "lib" "lib32" "lib64")
+
+include(CMakePushCheckState)
+cmake_push_check_state()
+
+if(LIKWID_INCLUDE_DIR)
+    set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${LIKWID_INCLUDE_DIR})
+endif()
+if(LIKWID_LIBRARY)
+    set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} ${LIKWID_LIBRARY})
+endif()
+
+cmake_pop_check_state()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+        "likwid"
+        DEFAULT_MSG
+        LIKWID_INCLUDE_DIR
+        LIKWID_LIBRARY
+)
+
+mark_as_advanced(LIKWID_INCLUDE_DIR LIKWID_LIBRARY)
+
+# if headers and library are found, store results
+if(likwid_FOUND)
+    set(likwid_INCLUDE_DIRS ${LIKWID_INCLUDE_DIR})
+    set(likwid_LIBRARIES ${LIKWID_LIBRARY})
+    # log result
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+            "Determing location of likwid succeeded:\n"
+            "Include directory: ${likwid_INCLUDE_DIRS}\n"
+            "Libraries to link against: ${likwid_LIBRARIES}\n\n")
+
+    set(likwid_DUNE_COMPILE_FLAGS "-I${likwid_INCLUDE_DIRS}"
+            CACHE STRING "Compile Flags used by DUNE when compiling with likwid programs")
+    set(likwid_DUNE_LIBRARIES ${likwid_LIBRARIES}
+            CACHE STRING "Libraries used by DUNE when linking likwid programs")
+else()
+    # log erroneous result; likwid_* variables are unset here, so report the raw find results
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+            "Determing location of likwid failed:\n"
+            "Include directory: ${LIKWID_INCLUDE_DIR}\n"
+            "Libraries to link against: ${LIKWID_LIBRARY}\n\n")
+endif()
+
+# set HAVE_LIKWID for config.h
+set(HAVE_LIKWID ${likwid_FOUND})
+
+
+# register all likwid related flags
+if(likwid_FOUND)
+    dune_register_package_flags(COMPILE_DEFINITIONS "ENABLE_LIKWID=1"
+            LIBRARIES "${likwid_LIBRARIES}"
+            INCLUDE_DIRS "${likwid_INCLUDE_DIRS}")
+endif()
+
+# text for feature summary (the name must match the one passed to find_package)
+set_package_properties("likwid" PROPERTIES
+        DESCRIPTION "likwid"
+        PURPOSE "Performance monitoring and benchmarking suite.")
diff --git a/dune/codegen/common/simdtraits.hh b/dune/codegen/common/simdtraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..73ee4caff1bb743d275e20173556416cc0bd2d30
--- /dev/null
+++ b/dune/codegen/common/simdtraits.hh
@@ -0,0 +1,16 @@
+#ifndef DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+#define DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+
+/** This is just the declaration of the traits classes, specialization for VCL and
+ *  OpCounter VCL are elsewhere.
+ */
+
+template<typename T>
+struct base_floatingpoint
+{};
+
+template<typename T>
+struct simd_size
+{};
+
+#endif
diff --git a/dune/codegen/common/vcltraits.hh b/dune/codegen/common/vcltraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..de764dbe6f78a51d1bf5934856ca36d4b90a77d0
--- /dev/null
+++ b/dune/codegen/common/vcltraits.hh
@@ -0,0 +1,86 @@
+#ifndef DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+#define DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+
+/** A collection of traits tools for the Vector Class Library */
+
+#include<cstddef>
+#include<dune/codegen/common/vectorclass.hh>
+
+template<>
+struct base_floatingpoint<Vec2d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec4f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec2d>
+{
+  static constexpr std::size_t value = 2;
+};
+
+template<>
+struct simd_size<Vec4f>
+{
+  static constexpr std::size_t value = 4;
+};
+
+#if MAX_VECTOR_SIZE >= 256
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec4d>
+{
+  static constexpr std::size_t value = 4;
+};
+
+template<>
+struct simd_size<Vec8f>
+{
+  static constexpr std::size_t value = 8;
+};
+#endif
+
+#if MAX_VECTOR_SIZE >= 512
+template<>
+struct base_floatingpoint<Vec8d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec16f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec8d>
+{
+  static constexpr std::size_t value = 8;
+};
+
+template<>
+struct simd_size<Vec16f>
+{
+  static constexpr std::size_t value = 16;
+};
+
+#endif
+
+#endif
diff --git a/dune/codegen/common/vectorclass.hh b/dune/codegen/common/vectorclass.hh
index aa3bba7b98f9cf57dcc223f61755e3aa26c5bdca..648ea52fe24490efc8af431fe54de4089515af33 100644
--- a/dune/codegen/common/vectorclass.hh
+++ b/dune/codegen/common/vectorclass.hh
@@ -1,71 +1,23 @@
 #ifndef DUNE_CODEGEN_COMMON_VECTORCLASS_HH
 #define DUNE_CODEGEN_COMMON_VECTORCLASS_HH
 
-
-template<typename T>
-struct base_floatingpoint
-{};
+#include<dune/codegen/common/simdtraits.hh>
 
 #ifdef ENABLE_COUNTER
+
 #if HAVE_DUNE_OPCOUNTER
 #include<dune/opcounter/vectorclass.hh>
-
-template<typename F, int size>
-struct base_floatingpoint<OpCounter::impl::OpCounterVector<F, size>>
-{
-  using value = OpCounter::OpCounter<F>;
-};
-
-
 #else
 #error "dune-opcounter is needed for opcounted vector types"
 #endif
+
 #else
+
 #include<dune/codegen/vectorclass/vectorclass.h>
 #include<dune/codegen/vectorclass/vectormath_exp.h>
 #include<dune/codegen/vectorclass/vectormath_hyp.h>
 #include<dune/codegen/vectorclass/vectormath_trig.h>
-
-template<>
-struct base_floatingpoint<Vec2d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec4f>
-{
-  using value = float;
-};
-
-
-#if MAX_VECTOR_SIZE >= 256
-template<>
-struct base_floatingpoint<Vec4d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec8f>
-{
-  using value = float;
-};
-#endif
-
-#if MAX_VECTOR_SIZE >= 512
-template<>
-struct base_floatingpoint<Vec8d>
-{
-  using value = double;
-};
-
-template<>
-struct base_floatingpoint<Vec16f>
-{
-  using value = float;
-};
-#endif
+#include<dune/codegen/common/vcltraits.hh>
 
 #endif
 
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index b96bd06cbce6361f1130e17bb7cb4d97efe08ec7..8881504f934e7b1d755580ffc46d47f2a1467346 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -25,5 +25,5 @@ add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
 target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
 
 if(benchmark_FOUND)
-  target_link_libraries(_autotune_target benchmark::benchmark)
+  target_link_libraries(_autotune_target benchmark)
 endif()
diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py
index d0cf1d4dc8b6880db13bfdec4458d815ea8208c2..bed0256407b7259bab61b6e932c4a17761097e75 100644
--- a/python/dune/codegen/generation/__init__.py
+++ b/python/dune/codegen/generation/__init__.py
@@ -16,6 +16,7 @@ from dune.codegen.generation.cpp import (base_class,
                                          class_member,
                                          constructor_parameter,
                                          dump_accumulate_timer,
+                                         register_liwkid_timer,
                                          end_of_file,
                                          include_file,
                                          initializer_list,
diff --git a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py
index 29384f98554ab895d37670c017fe5ecc4f191655..b918291067f45c5f988bc8fdcea55651d538a9db 100644
--- a/python/dune/codegen/generation/cpp.py
+++ b/python/dune/codegen/generation/cpp.py
@@ -50,3 +50,8 @@ def dump_accumulate_timer(name):
 
     code = "DUMP_TIMER({},{},{},{});".format(get_option("instrumentation_level"), name, os, reset)
     return code
+
+
+@generator_factory(item_tags=("register_likwid_timers",))
+def register_liwkid_timer(name):
+    return "LIKWID_MARKER_REGISTER(\"{}\");".format(name)
diff --git a/python/dune/codegen/loopy/symbolic.py b/python/dune/codegen/loopy/symbolic.py
index 799c7d6040e2e7d9e40c296d73c391aeb43cc6f3..c76fbb063b88b62732ac54bbce39f532ba96aa9a 100644
--- a/python/dune/codegen/loopy/symbolic.py
+++ b/python/dune/codegen/loopy/symbolic.py
@@ -122,6 +122,7 @@ lp.type_inference.TypeInferenceMapper.map_vectorized_sumfact_kernel = needs_reso
 
 # FusedMultiplyAdd node
 lp.symbolic.IdentityMapper.map_fused_multiply_add = identity_map_fused_multiply_add
+lp.symbolic.SubstitutionMapper.map_fused_multiply_add = identity_map_fused_multiply_add
 lp.symbolic.WalkMapper.map_fused_multiply_add = walk_map_fused_multiply_add
 lp.symbolic.StringifyMapper.map_fused_multiply_add = stringify_map_fused_multiply_add
 lp.symbolic.DependencyMapper.map_fused_multiply_add = dependency_map_fused_multiply_add
diff --git a/python/dune/codegen/loopy/transformations/instrumentation.py b/python/dune/codegen/loopy/transformations/instrumentation.py
index 7b13a09e597490dd1bef85344c9be6bdfb859dd3..2fab53a6215a15f0e06e7d42a18f4736b46da34a 100644
--- a/python/dune/codegen/loopy/transformations/instrumentation.py
+++ b/python/dune/codegen/loopy/transformations/instrumentation.py
@@ -1,9 +1,11 @@
 """ Add instrumentation instructions to a kernel """
 
 from dune.codegen.generation import (dump_accumulate_timer,
+                                     register_liwkid_timer,
                                      post_include,
                                      )
 from dune.codegen.options import get_option
+from dune.codegen.pdelab.driver.timings import start_region_timer_instruction, stop_region_timer_instruction
 
 import loopy as lp
 
@@ -67,28 +69,25 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     # Define the start instruction and correct dependencies for it
     start_id = "{}_start".format(ident)
     start_depends = _union(tuple(i.depends_on for i in insns)).difference(frozenset(i.id for i in insns))
-    start_insn = lp.CInstruction([],
-                                 "HP_TIMER_START({});".format(identifier),
-                                 id=start_id,
-                                 within_inames=within,
-                                 depends_on=depends_on.union(start_depends),
-                                 boostable_into=frozenset(),
-                                 tags=uniontags,
-                                 )
+    start_insn = start_region_timer_instruction(identifier,
+                                                id=start_id,
+                                                within_inames=within,
+                                                depends_on=depends_on.union(start_depends),
+                                                boostable_into=frozenset(),
+                                                tags=uniontags,)
 
     # Add dependencies on the timing instructions
     rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({start_id}))) for i in insns])
 
     # Define the stop instruction and correct dependencies for it
     stop_id = "{}_stop".format(ident)
-    stop_insn = lp.CInstruction([],
-                                "HP_TIMER_STOP({});".format(identifier),
-                                id=stop_id,
-                                within_inames=within,
-                                depends_on=frozenset(i.id for i in insns),
-                                boostable_into=frozenset(),
-                                tags=uniontags,
-                                )
+    stop_insn = stop_region_timer_instruction(identifier,
+                                              id=stop_id,
+                                              within_inames=within,
+                                              depends_on=frozenset(i.id for i in insns),
+                                              boostable_into=frozenset(),
+                                              tags=uniontags,
+                                              )
 
     # Find all the instructions that should depend on stop
     dep_insns = filter(lambda i: _intersect((i.depends_on, frozenset(i.id for i in insns))),
@@ -97,8 +96,11 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({stop_id}))) for i in dep_insns])
 
     # Trigger code generation on the file/operator level
-    post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag)
-    dump_accumulate_timer(identifier)
+    if get_option("use_likwid"):
+        register_liwkid_timer(identifier)
+    else:
+        post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag)
+        dump_accumulate_timer(identifier)
 
     # Filter all the instructions which were untouched
     other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions))
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 76a7f54f9d3516ee047913cb0ebd4947c189536b..c8eece5274d632fc764286bbcf1f346ba28092bd 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     target_name = CodegenOption(default=None, helpstr="The target name from CMake")
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
+    use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.")
     autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
 
     # Arguments that are mainly to be set by logic depending on other options
diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py
index 6526dcbf85cfb60d663a3f0193dde07351a361bd..b60544c1c78242f1490c76c46d4be6a2c4448501 100644
--- a/python/dune/codegen/pdelab/driver/__init__.py
+++ b/python/dune/codegen/pdelab/driver/__init__.py
@@ -270,15 +270,14 @@ def generate_driver():
 
     # Make sure that timestream is declared before retrieving chache items
     if get_option("instrumentation_level") >= 1:
-        from dune.codegen.pdelab.driver.timings import setup_timer, name_timing_stream
+        from dune.codegen.pdelab.driver.timings import setup_timer
         setup_timer()
-        timestream = name_timing_stream()
 
     from dune.codegen.pdelab.driver.error import return_statement
     return_statement()
 
     from dune.codegen.generation import retrieve_cache_items
-    from cgen import FunctionDeclaration, FunctionBody, Block, Value, LineComment, Line
+    from cgen import FunctionDeclaration, FunctionBody, Block, Value, LineComment, Line, Generable
     driver_signature = FunctionDeclaration(Value('int', 'main'), [Value('int', 'argc'), Value('char**', 'argv')])
 
     contents = []
@@ -292,6 +291,11 @@ def generate_driver():
             contents.append(Line("\n"))
 
     add_section("init", "Initialize basic stuff...")
+
+    if get_option("instrumentation_level") >= 1:
+        init_contents = contents
+        contents = []
+
     add_section("grid", "Setup grid (view)...")
     add_section("fem", "Set up finite element maps...")
     add_section("gfs", "Set up grid function spaces...")
@@ -306,13 +310,14 @@ def generate_driver():
     add_section("error", "Maybe calculate errors for test results...")
 
     if get_option("instrumentation_level") >= 1:
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver")
-        contents.insert(0, Line(text="HP_TIMER_START(driver);\n"))
-        contents.insert(len(contents) - 1, Line(text="HP_TIMER_STOP(driver);\n"))
-        contents.insert(len(contents) - 1, Line(text="DUMP_TIMER({}, driver, {}, true);\n".format(get_option("instrumentation_level"), timestream)))
-    contents.insert(0, Line(text="\n"))
-    driver_body = Block(contents)
+        from dune.codegen.pdelab.driver.timings import timed_region
+        contents = init_contents + timed_region('driver', contents)
+
+    add_section("end", "Stuff that should happen at the end...")
+    add_section("return_stmt", "Return statement...")
+
+    contents.insert(0, "\n")
+    driver_body = Block([c if isinstance(c, Generable) else Line(c + '\n') for c in contents])
 
     # Wrap a try/catch block around the driver body
     from dune.codegen.cgen import CatchBlock, TryCatchBlock, Value, Block, Line
diff --git a/python/dune/codegen/pdelab/driver/error.py b/python/dune/codegen/pdelab/driver/error.py
index cf9fe42933ca41f2696b39324bb4f443c35d9003..02207b4986dccd7a652c1943402243e755a8c681 100644
--- a/python/dune/codegen/pdelab/driver/error.py
+++ b/python/dune/codegen/pdelab/driver/error.py
@@ -186,8 +186,7 @@ def compare_L2_squared():
             "  {} = true;".format(fail)]
 
 
-@preamble(section="error")
+@preamble(section="return_stmt")
 def return_statement():
-    from dune.codegen.pdelab.driver.error import name_test_fail_variable
     fail = name_test_fail_variable()
     return "return {};".format(fail)
diff --git a/python/dune/codegen/pdelab/driver/solve.py b/python/dune/codegen/pdelab/driver/solve.py
index 79dfac051c66c2faa50c4e262eb419ed30b5fde1..4a6a3c9e7e235ee5368927fc6f97ac817ffde5f5 100644
--- a/python/dune/codegen/pdelab/driver/solve.py
+++ b/python/dune/codegen/pdelab/driver/solve.py
@@ -57,23 +57,8 @@ def dune_solve():
     if get_form_option("generate_jacobians"):
         print_matrix()
 
-    if get_option('instrumentation_level') >= 2:
-        from dune.codegen.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier
-        timestream = name_timing_stream()
-        setup_timer()
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(solve);", filetag="driver")
-
-        solve = ["HP_TIMER_START(solve);",
-                 "{}".format(solve),
-                 "HP_TIMER_STOP(solve);",
-                 "DUMP_TIMER({}, solve, {}, true);".format(get_option("instrumentation_level"), timestream),
-                 ]
-
-        if get_option('instrumentation_level') >= 3:
-            from dune.codegen.pdelab.driver.gridoperator import name_localoperator
-            lop_name = name_localoperator(form_ident)
-            solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+    from dune.codegen.pdelab.driver.timings import timed_region
+    solve = timed_region('solve', solve)
 
     return solve
 
diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py
index 714f263a353c5a8a3a6b3b83dbc651ffd3401961..aeca64d46c73f2327b48c22f07dca7a85a044104 100644
--- a/python/dune/codegen/pdelab/driver/timings.py
+++ b/python/dune/codegen/pdelab/driver/timings.py
@@ -1,12 +1,11 @@
 """ Timing related generator functions """
 
-from dune.codegen.options import get_option
 from dune.codegen.generation import (cached,
                                      include_file,
                                      pre_include,
-                                     post_include,
                                      preamble,
                                      )
+from dune.codegen.options import get_option
 from dune.codegen.pdelab.driver import (get_form_ident,
                                         is_linear,
                                         name_initree,
@@ -21,7 +20,7 @@ from dune.codegen.pdelab.driver.gridoperator import (name_gridoperator,
                                                      type_gridoperator,
                                                      )
 from dune.codegen.pdelab.driver.solve import (name_vector,
-                                              type_vector,
+                                              define_vector,
                                               )
 
 
@@ -90,109 +89,172 @@ def name_timing_stream():
     return name
 
 
+def name_temporary_vector(name, form):
+    name = "{}_{}".format(name, form)
+    define_vector(name, form)
+    return name
+
+
+@preamble(section="timings")
+def define_jacobian(name, form_ident):
+    t_go = type_gridoperator(form_ident)
+    n_go = name_gridoperator(form_ident)
+    return ["using M_{} = typename {}::Traits::Jacobian;".format(form_ident, t_go),
+            "M_{} {}({});".format(form_ident, name, n_go)]
+
+
+def name_jacobian(form_ident):
+    name = "J_{}".format(form_ident)
+    define_jacobian(name, form_ident)
+    return name
+
+
+@preamble(section="init")
+def init_likwid():
+    return ["LIKWID_MARKER_INIT;", "LIKWID_MARKER_THREADINIT;"]
+
+
+@preamble(section="end")
+def finalize_likwid():
+    return ["LIKWID_MARKER_CLOSE;"]
+
+
+@preamble(section="timings")
+def local_operator_likwid():
+    lop_name = name_localoperator(get_form_ident())
+    return "{}.register_likwid_timers();".format(lop_name)
+
+
 @cached
 def setup_timer():
     # TODO check that we are using YASP?
-    if get_option('opcounter'):
-        pre_include("#define ENABLE_COUNTER", filetag="driver")
-    pre_include("#define ENABLE_HP_TIMERS", filetag="driver")
-    include_file("dune/codegen/common/timer.hh", filetag="driver")
+    if get_option("use_likwid"):
+        pre_include("#define LIKWID_PERFMON", filetag="driver")
+        include_file("likwid.h", filetag="driver")
+        init_likwid()
+        if get_option('instrumentation_level') >= 3:
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably")
+            local_operator_likwid()
+        finalize_likwid()
+    else:
+        from dune.codegen.loopy.target import type_floatingpoint
+        pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver")
+        if get_option('opcounter'):
+            pre_include("#define ENABLE_COUNTER", filetag="driver")
+        pre_include("#define ENABLE_HP_TIMERS", filetag="driver")
+        include_file("dune/codegen/common/timer.hh", filetag="driver")
 
 
-@preamble(section="timings")
-def evaluate_residual_timer():
-    n_go = name_gridoperator(get_form_ident())
-    v = name_vector(get_form_ident())
-    t_v = type_vector(get_form_ident())
-    setup_timer()
+@preamble(section="init")
+def init_likwid_timer(region):
+    return ["LIKWID_MARKER_REGISTER(\"{}\");".format(region)]
 
-    if get_option('instrumentation_level') >= 2:
-        # Write back times
+
+def init_region_timer(region):
+    setup_timer()
+    if get_option("use_likwid"):
+        init_likwid_timer(region)
+    else:
         from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(residual_evaluation);", filetag="driver")
+        post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver")
+
+
+def start_region_timer(region):
+    if get_option("use_likwid"):
+        return ["LIKWID_MARKER_START(\"{}\");".format(region)]
+    else:
+        return ["HP_TIMER_START({});".format(region)]
+
+
+def stop_region_timer(region):
+    if get_option("use_likwid"):
+        return ["LIKWID_MARKER_STOP(\"{}\");".format(region)]
+    else:
         timestream = name_timing_stream()
-        print_times = []
+        return ["HP_TIMER_STOP({});".format(region),
+                "DUMP_TIMER({}, {}, {}, true);".format(get_option("instrumentation_level"), region, timestream)]
 
-    lop_name = name_localoperator(get_form_ident())
-    if get_option('instrumentation_level') >= 3:
-        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
-    if get_option('instrumentation_level') >= 2:
-        evaluation = ["HP_TIMER_START(residual_evaluation);",
-                      "{}.residual({}, r);".format(n_go, v),
-                      "HP_TIMER_STOP(residual_evaluation);",
-                      "DUMP_TIMER({}, residual_evaluation, {}, true);".format(get_option("instrumentation_level"), timestream)]
-        evaluation.extend(print_times)
+def start_region_timer_instruction(region, **kwargs):
+    if get_option("use_likwid"):
+        code = "LIKWID_MARKER_START(\"{}\");".format(region)
     else:
-        evaluation = ["{}.residual({}, r);".format(n_go, v)]
+        code = "HP_TIMER_START({});".format(region)
+    from loopy import CInstruction
+    return CInstruction([], code, **kwargs)
 
-    evaluation = ["{} r({});".format(t_v, v), "r=0.0;"] + evaluation
 
-    return evaluation
+def stop_region_timer_instruction(region, **kwargs):
+    if get_option("use_likwid"):
+        code = "LIKWID_MARKER_STOP(\"{}\");".format(region)
+    else:
+        code = "HP_TIMER_STOP({});".format(region)
+    from loopy import CInstruction
+    return CInstruction([], code, **kwargs)
 
 
-@preamble(section="timings")
-def apply_jacobian_timer():
-    n_go = name_gridoperator(get_form_ident())
-    v = name_vector(get_form_ident())
-    t_v = type_vector(get_form_ident())
-    setup_timer()
+def timed_region(region, actions):
+    if isinstance(actions, str):
+        actions = [actions]
+
+    assert(isinstance(actions, list))
 
     if get_option('instrumentation_level') >= 2:
-        # Write back times
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(apply_jacobian);", filetag="driver")
-        timestream = name_timing_stream()
+        assembly = []
         print_times = []
 
-    lop_name = name_localoperator(get_form_ident())
-    if get_option('instrumentation_level') >= 3:
-        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+        init_region_timer(region)
 
-    if is_linear():
-        declaration = ["{} j({});".format(t_v, v), "j=0.0;"]
-        evaluation = ["{}.jacobian_apply({}, j);".format(n_go, v)]
-    else:
-        declaration = ["{} j0({});".format(t_v, v), "j0=0.0;",
-                       "{} j1({});".format(t_v, v), "j1=0.0;"]
-        evaluation = ["{}.nonlinear_jacobian_apply({}, j0, j1);".format(n_go, v)]
+        if get_option('instrumentation_level') >= 3 and not get_option('use_likwid'):
+            timestream = name_timing_stream()
+            lop_name = name_localoperator(get_form_ident())
+            print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
-    if get_option('instrumentation_level') >= 2:
-        evaluation = ["HP_TIMER_START(apply_jacobian);"] + evaluation + ["HP_TIMER_STOP(apply_jacobian);", "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)]
-        evaluation.extend(print_times)
+        assembly += start_region_timer(region)
+        assembly += actions
+        assembly += stop_region_timer(region)
 
-    return declaration + evaluation
+        return assembly + print_times
+    else:
+        return actions
 
 
 @preamble(section="timings")
-def assemble_matrix_timer():
-    t_go = type_gridoperator(get_form_ident())
+def evaluate_residual_timer():
     n_go = name_gridoperator(get_form_ident())
     v = name_vector(get_form_ident())
-    t_v = type_vector(get_form_ident())
-    setup_timer()
+    r = name_temporary_vector("r", get_form_ident())
 
-    if get_option('instrumentation_level') >= 2:
-        # Write back times
-        from dune.codegen.generation import post_include
-        post_include("HP_DECLARE_TIMER(matrix_assembly);", filetag="driver")
-        timestream = name_timing_stream()
-        print_times = []
+    action = "{}.residual({}, {});".format(n_go, v, r)
 
-    lop_name = name_localoperator(get_form_ident())
-    if get_option('instrumentation_level') >= 3:
-        print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
+    return timed_region("residual_evaluation", action)
 
-    if get_option('instrumentation_level') >= 2:
-        assembly = ["HP_TIMER_START(matrix_assembly);",
-                    "{}.jacobian({},m);".format(n_go, v),
-                    "HP_TIMER_STOP(matrix_assembly);",
-                    "DUMP_TIMER({}, matrix_assembly, {}, true);".format(get_option("instrumentation_level"), timestream)]
-        assembly.extend(print_times)
+
+@preamble(section="timings")
+def apply_jacobian_timer():
+    form = get_form_ident()
+    n_go = name_gridoperator(form)
+    v = name_vector(form)
+
+    if is_linear():
+        j = name_temporary_vector("j", form)
+        action = "{}.jacobian_apply({}, {});".format(n_go, v, j)
     else:
-        assembly = ["{}.jacobian({},m);".format(n_go, v)]
+        j0 = name_temporary_vector("j0", form)
+        j1 = name_temporary_vector("j1", form)
+        action = "{}.nonlinear_jacobian_apply({}, {}, {});".format(n_go, v, j0, j1)
+
+    return timed_region("apply_jacobian", action)
+
+
+@preamble(section="timings")
+def assemble_matrix_timer():
+    n_go = name_gridoperator(get_form_ident())
+    v = name_vector(get_form_ident())
+    m = name_jacobian(get_form_ident())
 
-    assembly = ["using M = typename {}::Traits::Jacobian;".format(t_go),
-                "M m({});".format(n_go)] + assembly
+    action = "{}.jacobian({},{});".format(n_go, v, m)
 
-    return assembly
+    return timed_region("matrix_assembly", action)
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index 3c06fa2bc0dcd14f68248a4c063dc0ba906b2568..dfc553fbefee7fa8c2e6aa1d92ae7ea047acdd38 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -17,6 +17,7 @@ from dune.codegen.generation import (accumulation_mixin,
                                      constructor_parameter,
                                      domain,
                                      dump_accumulate_timer,
+                                     register_liwkid_timer,
                                      end_of_file,
                                      function_mangler,
                                      generator_factory,
@@ -677,6 +678,19 @@ class TimerMethod(ClassMember):
         ClassMember.__init__(self, content)
 
 
+class RegisterLikwidMethod(ClassMember):
+    """Class member emitting ``void register_likwid_timers()`` built from the cached 'register_likwid_timers' items"""
+    def __init__(self):
+        # Generating this member requires that name_example_kernel() already yields a value
+        assert name_example_kernel() is not None
+
+        # Signature and opening brace are separate list entries (a missing comma would silently concatenate them)
+        content = ["void register_likwid_timers()", "{"]
+        content.extend('  ' + line for line in retrieve_cache_items(condition='register_likwid_timers'))
+        content.append("}")
+        ClassMember.__init__(self, content)
+
+
 class LoopyKernelMethod(ClassMember):
     def __init__(self, signature, kernel, add_timings=True, initializer_list=[]):
         from loopy import generate_body
@@ -697,26 +711,49 @@ class LoopyKernelMethod(ClassMember):
                 from dune.codegen.pdelab.signatures import assembler_routine_name
                 timer_name = assembler_routine_name() + '_kernel'
                 name_example_kernel(name=timer_name)
-                post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-                content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
-                dump_accumulate_timer(timer_name)
 
-            if add_timings and get_option("instrumentation_level") >= 4:
-                setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
-                post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
-                content.append('  HP_TIMER_START({});'.format(setuptimer))
-                dump_accumulate_timer(setuptimer)
+                if get_option('use_likwid'):
+                    from dune.codegen.pdelab.driver.timings import init_likwid_timer
+                    include_file("likwid.h", filetag="operatorfile")
+                    init_likwid_timer(timer_name)
+                    content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name))
+                    register_liwkid_timer(timer_name)
+                else:
+                    post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+                    content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
+                    dump_accumulate_timer(timer_name)
+
+                if add_timings and get_option("instrumentation_level") >= 4:
+                    setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
+                    if get_option('use_likwid'):
+                        from dune.codegen.pdelab.driver.timings import init_likwid_timer
+                        init_likwid_timer(setuptimer)
+                        content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer))
+                        register_liwkid_timer(setuptimer)
+                    else:
+                        post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
+                        content.append('  HP_TIMER_START({});'.format(setuptimer))
+                        dump_accumulate_timer(setuptimer)
 
             # Add kernel preamble
             for i, p in kernel.preambles:
                 content.append('  ' + p)
 
+            if add_timings and get_option('instrumentation_level') >= 4:
+                if get_option('use_likwid'):
+                    content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer))
+                else:
+                    content.append('  ' + 'HP_TIMER_STOP({});'.format(setuptimer))
+
             # Add kernel body
             content.extend(l for l in generate_body(kernel).split('\n')[1:-1])
 
             # Stop timer
             if add_timings and get_option('instrumentation_level') >= 3:
-                content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
+                if get_option('use_likwid'):
+                    content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name))
+                else:
+                    content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
 
         content.append('}')
         ClassMember.__init__(self, content, name=kernel.name if kernel is not None else "")
@@ -1145,7 +1182,10 @@ def generate_localoperator_file(kernels, filename):
 
     if get_option('instrumentation_level') >= 3:
         include_file('dune/codegen/common/timer.hh', filetag='operatorfile')
-        operator_methods.append(TimerMethod())
+        if get_option('use_likwid'):
+            operator_methods.append(RegisterLikwidMethod())
+        else:
+            operator_methods.append(TimerMethod())
     elif get_option('opcounter'):
         include_file('dune/codegen/common/timer.hh', filetag='operatorfile')
 
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index b4a48b49473c64b3b46136f7654c34fc53f43ca0..97e7f1d540823d1753197272f0ccd0a415d8a8b2 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -7,6 +7,7 @@ import filelock
 import hashlib
 import json
 from operator import mul
+import time
 
 import loopy as lp
 from pytools import product
@@ -96,10 +97,8 @@ def write_global_data(sf, filename):
 
 
 def write_setup_code(sf, filename, define_thetas=True):
-    opcounting = get_option("opcounter")
     with open(filename, "a") as f:
         # Setup a polynomial object (normally done in the LocalOperator members)
-        set_option("opcounter", False)
         from dune.codegen.loopy.target import type_floatingpoint
         real = type_floatingpoint()
         f.write("  using RF = {};\n".format(real))
@@ -118,11 +117,12 @@ def write_setup_code(sf, filename, define_thetas=True):
         constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
 
         # Allocate buffers
+        alignment = get_option("max_vector_width") // 8
         size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
                    product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
         size = int(size * (get_option("precision_bits") / 8))
-        f.writelines(["  char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
-                      "  char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
+        f.writelines(["  char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
+                      "  char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
                       ])
 
         # Setup fastdg inputs
@@ -131,7 +131,7 @@ def write_setup_code(sf, filename, define_thetas=True):
                 f.write("{} = 0;\n".format(arg))
             else:
                 size = sf.interface.fastdg_interface_object_size
-                f.write("  RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))
+                f.write("  RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment))
 
         # Write stuff into the input buffer
         f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
@@ -174,12 +174,15 @@ def write_setup_code(sf, filename, define_thetas=True):
                       "  rng.seed(42);\n",
                       "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                       ])
-    return opcounting
 
 
 def generate_standalone_code_google_benchmark(sf, filename):
     delete_cache_items("kernel_default")
 
+    # Turn off opcounting while the benchmark code is generated; the saved value is restored at the end
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
     # Extract sum factorization kernel
     from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
     knl = realize_sumfact_kernel_function(sf)
@@ -207,6 +210,10 @@ def generate_standalone_code_google_benchmark(sf, filename):
     write_global_data(sf, filename)
 
     with open(filename, "a") as f:
+        arguments = ', '.join(sf.interface.signature_args)
+        if len(arguments) > 0:
+            arguments = ', ' + arguments
+        arguments = 'const char* buffer0, const char* buffer1' + arguments
         f.write("void sumfact_kernel({})\n".format(arguments))
         for line in knl.member.lines[1:]:
             f.write("{}\n".format(line))
@@ -214,7 +221,7 @@ def generate_standalone_code_google_benchmark(sf, filename):
         f.write("\n\n")
         f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
 
-    opcounting = write_setup_code(sf, filename, define_thetas=False)
+    write_setup_code(sf, filename, define_thetas=False)
 
     additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
     additional_arguments = ', '.join(additional_arguments)
@@ -229,12 +236,18 @@ def generate_standalone_code_google_benchmark(sf, filename):
                       "\n",
                       "BENCHMARK_MAIN();"
                       ])
+
+    # Restore the opcounter option to the value it had on entry
     set_option("opcounter", opcounting)
 
 
 def generate_standalone_code(sf, filename):
     delete_cache_items("kernel_default")
 
+    # Turn off opcounting
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
     # Extract sum factorization kernel
     from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
     knl = realize_sumfact_kernel_function(sf)
@@ -259,7 +272,7 @@ def generate_standalone_code(sf, filename):
                       "{\n",
                       ])
 
-    opcounting = write_setup_code(sf, filename)
+    write_setup_code(sf, filename)
 
     # Write measurement
     with open(filename, "a") as f:
@@ -285,133 +298,12 @@ def generate_standalone_code(sf, filename):
                       "  std::cout << accum;\n",
                       "}\n",
                       ])
-        set_option("opcounter", opcounting)
-
-
-def generate_standalone_kernel_code(kernel, signature, filename):
-    with open(filename, 'w') as f:
-        # Write headers
-        headers = ['#include "config.h"',
-                   '#include <iostream>',
-                   '#include <fstream>',
-                   '#include <random>',
-                   '#include "benchmark/benchmark.h"',
-                   '#include <dune/codegen/common/vectorclass.hh>',
-                   '#include <dune/codegen/sumfact/horizontaladd.hh>',
-                   ]
-        f.write("\n".join(headers))
-
-        # Get a list of the function argument names
-        assert len(signature) == 1
-        sig = signature[0]
-        arguments = sig[sig.find('(') +1:sig.find(')')].split(',')
-        arguments = [a.split(' ')[-1] for a in arguments]
-
-        global_args = [a for a in kernel.args if a.name not in arguments]
-
-        # Declare global arguments
-        f.write('\n\n')
-        target = DuneTarget()
-        for g in global_args:
-            decl_info = g.decl_info(target, True, g.dtype)
-            for idi in decl_info:
-                ast_builder = target.get_device_ast_builder()
-                arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
-                arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
-                arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
-                f.write('{}\n'.format(arg_decl))
-
-        # Generate function we want to benchmark
-        f.write('\n')
-        f.write(sig[0:sig.find(')')+1])
-        f.writelines(lp.generate_body(kernel))
-        f.write('\n\n')
-
-        # Generate function that will do the benchmarking
-        f.write('static void BM_sumfact_kernel(benchmark::State& state){\n')
-
-        # Declare random generators
-        real = type_floatingpoint()
-        lines = ['  std::uniform_real_distribution<{}> unif(0,1);'.format(real),
-                 '  std::uniform_int_distribution<int> unif_int(0,128);',
-                 '  std::default_random_engine re;']
-        f.write('\n'.join(lines) + '\n')
-
-        # Declare function arguments
-        function_arguments = [a for a in kernel.args if a.name in arguments]
-        for arg in function_arguments:
-            if 'buffer' in arg.name:
-                byte_size = reduce(mul, arg.shape) * 8
-                f.write('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
-                                                                                 byte_size,
-                                                                                 arg.alignment),)
-            elif isinstance(arg, lp.ValueArg):
-                assert 'jacobian_offset' in arg.name
-                decl = arg.get_arg_decl(ast_builder)
-                decl = Initializer(decl, 'unif_int(re)')
-                f.write('  {}\n'.format(decl))
-            else:
-                assert 'fastdg' in arg.name
-                size = reduce(mul, arg.shape)
-                alignment = arg.dtype.itemsize
-                f.write('  {} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
-                                                                               arg.name,
-                                                                               size,
-                                                                               alignment))
-
-        # Initialize arguments
-        def _initialize_arg(arg):
-            if isinstance(arg, lp.ValueArg):
-                return []
-            real = type_floatingpoint()
-            size = reduce(mul, arg.shape)
-            fill_name = arg.name + '_fill'
-            lines = ['  {}* {} = (double *) {};'.format(real, fill_name, arg.name),
-                     '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
-                     '    {}[i] = unif(re);'.format(fill_name),
-                     '  }']
-            return lines
-
-        for arg in kernel.args:
-            lines = _initialize_arg(arg)
-            f.write('\n'.join(lines) + '\n')
-
-        # Benchmark loop
-        function_call = kernel.name + '({})'.format(','.join(arguments))
-        f.writelines(['  for (auto _ : state){\n',
-                      '    {};\n'.format(function_call),
-                      '  }\n',
-                      ])
-        f.write('}\n')
-
-        # Benchmark main
-        main = ['',
-                'BENCHMARK(BM_sumfact_kernel);',
-                '',
-                'BENCHMARK_MAIN();']
-        f.write('\n'.join(main))
-
-
-def autotune_realization(sf=None, kernel=None, signature=None):
-    """"Generate an executable run a benchmark and return time
-
-    The benchmark can be generated from a SumfactKernel or a LoopKernel with a
-    function signature. For SumfactKernels you can generate benchmarks with or
-    without google benchmark, for LoopKernels only google benchmarks are
-    possible.
-
-    Parameters
-    ----------
-    sf : SumfactKernel or VectorizedSumfactKernel
-    kernel : loopy.kernel.LoopKernel
-    signature : str
-    """
-    if sf is None:
-        assert kernel and signature
-        assert get_option("autotune_google_benchmark")
-    else:
-        assert kernel is None and signature is None
 
+    # Maybe turn opcounting on again
+    set_option("opcounter", opcounting)
+
+
+def autotune_realization(sf):
     # Make sure that the benchmark directory exists
     dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
     if not os.path.exists(dir):
@@ -452,6 +344,10 @@ def autotune_realization(sf=None, kernel=None, signature=None):
                 if ret != 0:
                     raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
 
+                # Wait until the compiled executable is visible on the file system (it may lag, e.g. on NFS)
+                while not os.path.exists(executable):
+                    time.sleep(0.01)
+
                 # Check whether the user specified an execution wrapper
                 call = []
                 wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER")
@@ -469,10 +365,19 @@ def autotune_realization(sf=None, kernel=None, signature=None):
                 if ret != 0:
                     raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
 
+                # Wait until the benchmark log file is visible on the file system (it may lag, e.g. on NFS)
+                while not os.path.exists(logname):
+                    time.sleep(0.01)
+
             # Extract the result form the log file
             if get_option("autotune_google_benchmark"):
+                import json
                 with open(logname) as json_file:
-                    data = json.load(json_file)
-                    return data['benchmarks'][0]['cpu_time']
+                    try:
+                        data = json.load(json_file)
+                        return data['benchmarks'][0]['cpu_time']
+                    except Exception as e:
+                        print("Error while loading file {}".format(logname))
+                        raise e
             else:
                 return float(next(iter(open(logname, "r")))) / 1000000