From 4ee4572ac5a69118e89c1ed5ef831ba19e027ae5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20He=C3=9F?= <>
Date: Fri, 15 Feb 2019 15:14:17 +0100
Subject: [PATCH] [skip ci] Make it possible to use google-benchmark for autotuning

 python/dune/codegen/options.py          |   1 +
 python/dune/codegen/sumfact/autotune.py | 156 ++++++++++++++++++++----
 2 files changed, 132 insertions(+), 25 deletions(-)

diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index bb6bbafa..86424869 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     target_name = CodegenOption(default=None, helpstr="The target name from CMake")
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
+    autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = CodegenOption(default=256, helpstr=None)
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 68d81957..f8b03de6 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -65,28 +65,38 @@ def compiler_invocation(name, filename):
     return compile_flags
-def generate_standalone_code(sf, filename):
-    delete_cache_items("kernel_default")
+def write_global_data(sf, filename):
+    """Append the constructor kernel's temporary declarations to ``filename``.
+
+    The declarations are written without indentation, one per line, and the
+    file is opened in append mode, so whatever the caller wrote before is kept.
+
+    :arg sf: passed on to ``realize_sumfact_kernel_function``
+    :arg filename: path of the file to append to
+    """
+    # NOTE(review): opcounting is read here but not used anywhere in this
+    # function - confirm whether the lookup can be dropped.
+    opcounting = get_option("opcounter")
+    with open(filename, "a") as f:
+        # Get kernel
+        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+        # NOTE(review): knl is not used below; presumably the call is kept for
+        # its effect on the kernel cache read in the next line - verify.
+        knl = realize_sumfact_kernel_function(sf)
+        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
+        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
+        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
-    with open(filename, "w") as f:
-        f.writelines(["#include \"config.h\"\n",
-                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
-                      "#include<dune/codegen/common/tsc.hh>\n",
-                      "#include<dune/codegen/common/vectorclass.hh>\n",
-                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
-                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
-                      "#include<random>\n",
-                      "#include<fstream>\n",
-                      "#include<iostream>\n",
-                      "\n"
-                      ])
+        target = DuneTarget()
+        from loopy.codegen import CodeGenerationState
+        # Hand-built code generation state; it is only used as the argument of
+        # get_temporary_decls below.
+        codegen_state = CodeGenerationState(kernel=constructor_knl,
+                                            implemented_data_info=None,
+                                            implemented_domain=None,
+                                            implemented_predicates=frozenset(),
+                                            seen_dtypes=frozenset(),
+                                            seen_functions=frozenset(),
+                                            seen_atomic_dtypes=frozenset(),
+                                            var_subst_map={},
+                                            allow_complex=False,
+                                            is_generating_device_code=True,
+                                            )
+        # Only the first item produced by each declaration's generate() is
+        # written, followed by a newline.
+        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+            f.write("{}\n".format(next(iter(decl.generate()))))
-        f.writelines(["int main(int argc, char** argv)\n",
-                      "{\n",
-                      ])
+def write_setup_code(sf, filename, define_thetas=True):
+    opcounting = get_option("opcounter")
+    with open(filename, "a") as f:
         # Setup a polynomial object (normally done in the LocalOperator members)
-        opcounting = get_option("opcounter")
         set_option("opcounter", False)
         from import type_floatingpoint
         real = type_floatingpoint()
@@ -98,7 +108,7 @@ def generate_standalone_code(sf, filename):
         for deg in set(degs):
             f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
-        # Get kernels
+        # Get kernel
         from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
         knl = realize_sumfact_kernel_function(sf)
         constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
@@ -142,8 +152,9 @@ def generate_standalone_code(sf, filename):
-        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
-            f.write("  {}\n".format(next(iter(decl.generate()))))
+        if define_thetas:
+            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+                f.write("  {}\n".format(next(iter(decl.generate()))))
         for _, line in constructor_knl.preambles:
             if "gfsu" not in line:
@@ -161,7 +172,89 @@ def generate_standalone_code(sf, filename):
                       "  rng.seed(42);\n",
                       "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
+    return opcounting
+def generate_standalone_code_google_benchmark(sf, filename):
+    """Write a standalone google-benchmark program for ``sf`` to ``filename``.
+
+    The generated C++ file consists of, in this order: a comment echoing the
+    first line of the kernel code, the include block, whatever
+    ``write_global_data`` appends, the kernel as a free function
+    ``sumfact_kernel``, a ``BM_sumfact_kernel`` benchmark function whose
+    setup part is appended by ``write_setup_code``, and ``BENCHMARK_MAIN()``.
+
+    :arg sf: passed on to ``realize_sumfact_kernel_function`` and to the
+        two write helpers
+    :arg filename: path of the C++ source file; it is truncated first and
+        then appended to
+    """
+    delete_cache_items("kernel_default")
+    # Extract sum factorization kernel
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+    # Add the implementation of the kernel.
+    # TODO: This can probably be done in a safer way?
+    # The argument list is taken as the text between the first "(" and the
+    # first ")" of the first line of the kernel code.
+    first_line = knl.member.lines[0]
+    arguments = first_line[first_line.find("(")+1:first_line.find(")")]
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(first_line),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include \"benchmark/benchmark.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+    # The write helpers reopen the file by name in append mode, so the handle
+    # above has to be closed before each of them is called.
+    write_global_data(sf, filename)
+    with open(filename, "a") as f:
+        # Replace the original first line by a free function header and copy
+        # the remaining kernel lines unchanged.
+        f.write("void sumfact_kernel({})\n".format(arguments))
+        for line in knl.member.lines[1:]:
+            f.write("{}\n".format(line))
+        f.write("\n\n")
+        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
+    write_setup_code(sf, filename, define_thetas=False)
+    with open(filename, "a") as f:
+        f.writelines(["  for (auto _ : state){\n",
+                      "    sumfact_kernel(buffer0, buffer1);\n",
+                      "  }\n",
+                      "}\n",
+                      "BENCHMARK(BM_sumfact_kernel);\n",
+                      "\n",
+                      "BENCHMARK_MAIN();"
+                      ])
+def generate_standalone_code(sf, filename):
+    """Write a standalone benchmark program for ``sf`` to ``filename``.
+
+    The visible part of this function writes a comment echoing the first
+    line of the kernel code, the include block and the opening of ``main``,
+    lets ``write_setup_code`` append the setup code, and then reopens the
+    file in append mode to write the measurement code.
+
+    :arg sf: passed on to ``realize_sumfact_kernel_function`` and
+        ``write_setup_code``
+    :arg filename: path of the C++ source file; it is truncated first and
+        then appended to
+    """
+    delete_cache_items("kernel_default")
+    # Extract sum factorization kernel
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+    # Bind the first line of the kernel code before it is formatted into the
+    # header comment below; without this the name is undefined here.
+    first_line = knl.member.lines[0]
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(first_line),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/tsc.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+        f.writelines(["int main(int argc, char** argv)\n",
+                      "{\n",
+                      ])
+    # write_setup_code reopens the file by name in append mode, so the handle
+    # above is closed first; its return value is kept in opcounting.
+    opcounting = write_setup_code(sf, filename)
+    # Write measurement
+    with open(filename, "a") as f:
         # Start a TSC timer
         f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
@@ -204,12 +297,15 @@ def autotune_realization(sf):
     with cache_restoring():
         with filelock.FileLock(lock):
             if not os.path.isfile(logname):
-                generate_standalone_code(sf, filename)
+                if get_option("autotune_google_benchmark"):
+                    generate_standalone_code_google_benchmark(sf, filename)
+                else:
+                    generate_standalone_code(sf, filename)
                 devnull = open(os.devnull, 'w')
                 ret =, filename), stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
-                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename))))
+                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(executable, filename))))
                 # Check whether the user specified an execution wrapper
                 call = []
@@ -219,10 +315,20 @@ def autotune_realization(sf):
                 # Run the benchmark program
-                call.append(logname)
+                if get_option("autotune_google_benchmark"):
+                    call.append("--benchmark_out={}".format(logname))
+                    # call.append("--benchmark_out_format=csv")
+                else:
+                    call.append(logname)
                 ret =, stdout=devnull, stderr=subprocess.STDOUT)
                 if ret != 0:
                     raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
             # Extract the result form the log file
-            return float(next(iter(open(logname, "r")))) / 1000000
+            if get_option("autotune_google_benchmark"):
+                import json
+                with open(logname) as json_file:
+                    data = json.load(json_file)
+                    return data['benchmarks'][0]['cpu_time']
+            else:
+                return float(next(iter(open(logname, "r")))) / 1000000