diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake
index 787f9d48399d35ae25062f59d8d2343cce2f9607..bd3f9649116d72afee7118e5d3940e5f11a54e4e 100644
--- a/cmake/modules/DuneCodegenMacros.cmake
+++ b/cmake/modules/DuneCodegenMacros.cmake
@@ -79,6 +79,8 @@
 #    generator is run.
 #
 
+find_package(benchmark QUIET)  # optional: only used behind if(benchmark_FOUND) guards
+
 add_custom_target(generation)
 
 # Gather a list of form compiler sources to add as dependencies
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 84ac56603f1c749cf8979d52d3d890ed9d2e0ee4..b96bd06cbce6361f1130e17bb7cb4d97efe08ec7 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -21,6 +21,9 @@ dune_python_add_test(NAME pep8-ourcode
 
 add_subdirectory(test)
 
-# Add a dummy target to extract compiler flags for the autotune tool chain
 add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
 target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
+
+if(benchmark_FOUND)
+  target_link_libraries(_autotune_target benchmark::benchmark)
+endif()
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index bb6bbafaf09dd0b0ae7330b174dfa5805769d09f..8642486908dbbea2c9bde300fe377cd4ae94c1ba 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     target_name = CodegenOption(default=None, helpstr="The target name from CMake")
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
+    autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = CodegenOption(default=256, helpstr=None)
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 664772014a85f177f90702c94fe66558493e902e..717b9d804ef849b29e1c3fe3c4d7702010efc6e1 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -65,28 +65,38 @@ def compiler_invocation(name, filename):
     return compile_flags
 
 
-def generate_standalone_code(sf, filename):
-    delete_cache_items("kernel_default")
+def write_global_data(sf, filename):
+    opcounting = get_option("opcounter")
+    with open(filename, "a") as f:
+        # Get kernel
+        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+        knl = realize_sumfact_kernel_function(sf)
+        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
+        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
+        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
 
-    with open(filename, "w") as f:
-        f.writelines(["#include \"config.h\"\n",
-                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
-                      "#include<dune/codegen/common/tsc.hh>\n",
-                      "#include<dune/codegen/common/vectorclass.hh>\n",
-                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
-                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
-                      "#include<random>\n",
-                      "#include<fstream>\n",
-                      "#include<iostream>\n",
-                      "\n"
-                      ])
+        target = DuneTarget()
+        from loopy.codegen import CodeGenerationState
+        codegen_state = CodeGenerationState(kernel=constructor_knl,
+                                            implemented_data_info=None,
+                                            implemented_domain=None,
+                                            implemented_predicates=frozenset(),
+                                            seen_dtypes=frozenset(),
+                                            seen_functions=frozenset(),
+                                            seen_atomic_dtypes=frozenset(),
+                                            var_subst_map={},
+                                            allow_complex=False,
+                                            is_generating_device_code=True,
+                                            )
+
+        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+            f.write("{}\n".format(next(iter(decl.generate()))))
 
-        f.writelines(["int main(int argc, char** argv)\n",
-                      "{\n",
-                      ])
 
+def write_setup_code(sf, filename, define_thetas=True):
+    opcounting = get_option("opcounter")
+    with open(filename, "a") as f:
         # Setup a polynomial object (normally done in the LocalOperator members)
-        opcounting = get_option("opcounter")
         set_option("opcounter", False)
         from dune.codegen.loopy.target import type_floatingpoint
         real = type_floatingpoint()
@@ -98,7 +108,7 @@ def generate_standalone_code(sf, filename):
         for deg in set(degs):
             f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
 
-        # Get kernels
+        # Get kernel
         from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
         knl = realize_sumfact_kernel_function(sf)
         constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
@@ -142,8 +152,9 @@ def generate_standalone_code(sf, filename):
                                             is_generating_device_code=True,
                                             )
 
-        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
-            f.write("  {}\n".format(next(iter(decl.generate()))))
+        if define_thetas:
+            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+                f.write("  {}\n".format(next(iter(decl.generate()))))
 
         for _, line in constructor_knl.preambles:
             if "gfsu" not in line:
@@ -161,7 +172,90 @@ def generate_standalone_code(sf, filename):
                       "  rng.seed(42);\n",
                       "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                       ])
+    return opcounting
+
+
+def generate_standalone_code_google_benchmark(sf, filename):
+    delete_cache_items("kernel_default")
+
+    # Realize the sum factorization kernel function for sf; its lines are written out below
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+
+    # Take the argument list from the kernel's first line: the text between the first "(" and the first ")".
+    # TODO: This can probably be done in a safer way?
+    first_line = knl.member.lines[0]
+    arguments = first_line[first_line.find("(") + 1:first_line.find(")")]
+
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(first_line),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include \"benchmark/benchmark.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+
+    write_global_data(sf, filename)
+
+    with open(filename, "a") as f:
+        f.write("void sumfact_kernel({})\n".format(arguments))
+        for line in knl.member.lines[1:]:
+            f.write("{}\n".format(line))
+
+        f.write("\n\n")
+        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
+
+    write_setup_code(sf, filename, define_thetas=False)
+
+    with open(filename, "a") as f:
+        f.writelines(["  for (auto _ : state){\n",
+                      "    sumfact_kernel(buffer0, buffer1);\n",
+                      "  }\n",
+                      "}\n",
+                      "BENCHMARK(BM_sumfact_kernel);\n",
+                      "\n",
+                      "BENCHMARK_MAIN();"
+                      ])
+
+
+def generate_standalone_code(sf, filename):
+    delete_cache_items("kernel_default")
 
+    # Extract sum factorization kernel
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+    first_line = knl.member.lines[0]
+
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(first_line),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/tsc.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+
+        f.writelines(["int main(int argc, char** argv)\n",
+                      "{\n",
+                      ])
+
+    opcounting = write_setup_code(sf, filename)
+
+    # Write measurement
+    with open(filename, "a") as f:
         # Start a TSC timer
         f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
                       ])
@@ -205,12 +299,15 @@ def autotune_realization(sf):
     with cache_restoring():
         with filelock.FileLock(lock):
             if not os.path.isfile(logname):
-                generate_standalone_code(sf, filename)
+                if get_option("autotune_google_benchmark"):
+                    generate_standalone_code_google_benchmark(sf, filename)
+                else:
+                    generate_standalone_code(sf, filename)
 
                 devnull = open(os.devnull, 'w')
                 ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
-                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename))))
+                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(executable, filename))))
 
                 # Check whether the user specified an execution wrapper
                 call = []
@@ -220,10 +317,20 @@ def autotune_realization(sf):
 
                 # Run the benchmark program
                 call.append(executable)
-                call.append(logname)
+                if get_option("autotune_google_benchmark"):
+                    call.append("--benchmark_out={}".format(logname))
+                    # No --benchmark_out_format flag (e.g. csv): the log is parsed as JSON below, presumably the default.
+                else:
+                    call.append(logname)
                 ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
                     raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
 
             # Extract the result form the log file
-            return float(next(iter(open(logname, "r")))) / 1000000
+            if get_option("autotune_google_benchmark"):
+                import json
+                with open(logname) as json_file:
+                    data = json.load(json_file)
+                    return data['benchmarks'][0]['cpu_time']
+            else:
+                return float(next(iter(open(logname, "r")))) / 1000000
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 307e6d43e25caa89234a884156c7ee5643f16b8f..3b16c1da023a17b077df6c1acac29a667288cf39 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -119,3 +119,13 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
                                   BASENAME sumfact_poisson_dg_3d_diagonal
                                   INIFILE diagonal.mini
                                   )
+
+#======================================
+# Test autotuning with google-benchmark
+#======================================
+if(benchmark_FOUND)
+  dune_add_formcompiler_system_test(UFLFILE poisson_3d.ufl
+                                  BASENAME sumfact_poisson_3d_benchmark
+                                  INIFILE poisson_3d_benchmark.mini
+                                  )
+endif()
diff --git a/test/sumfact/poisson/poisson_3d_benchmark.mini b/test/sumfact/poisson/poisson_3d_benchmark.mini
new file mode 100644
index 0000000000000000000000000000000000000000..aca0d876328991b5b61e848919274631b78d8434
--- /dev/null
+++ b/test/sumfact/poisson/poisson_3d_benchmark.mini
@@ -0,0 +1,29 @@
+__name = sumfact_poisson_3d_benchmark_{__exec_suffix}
+__exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
+
+deg_suffix = deg{formcompiler.ufl_variants.degree}
+diff_suffix = symdiff
+quadvec_suffix = quadvec
+gradvec_suffix = autotunevec
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = autotune
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 1