From 4ee4572ac5a69118e89c1ed5ef831ba19e027ae5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20He=C3=9F?= <rene.hess@iwr.uni-heidelberg.de>
Date: Fri, 15 Feb 2019 15:14:17 +0100
Subject: [PATCH] [skip ci] Make it possible to use google-benchmark for
 autotuning

---
 python/dune/codegen/options.py          |   1 +
 python/dune/codegen/sumfact/autotune.py | 156 ++++++++++++++++++++----
 2 files changed, 132 insertions(+), 25 deletions(-)

diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index bb6bbafa..86424869 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     target_name = CodegenOption(default=None, helpstr="The target name from CMake")
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
+    autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = CodegenOption(default=256, helpstr=None)
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 68d81957..f8b03de6 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -65,28 +65,38 @@ def compiler_invocation(name, filename):
     return compile_flags
 
 
-def generate_standalone_code(sf, filename):
-    delete_cache_items("kernel_default")
+def write_global_data(sf, filename):
+    opcounting = get_option("opcounter")
+    with open(filename, "a") as f:
+        # Get kernel
+        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+        knl = realize_sumfact_kernel_function(sf)
+        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
+        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
+        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
 
-    with open(filename, "w") as f:
-        f.writelines(["#include \"config.h\"\n",
-                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
-                      "#include<dune/codegen/common/tsc.hh>\n",
-                      "#include<dune/codegen/common/vectorclass.hh>\n",
-                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
-                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
-                      "#include<random>\n",
-                      "#include<fstream>\n",
-                      "#include<iostream>\n",
-                      "\n"
-                      ])
+        target = DuneTarget()
+        from loopy.codegen import CodeGenerationState
+        codegen_state = CodeGenerationState(kernel=constructor_knl,
+                                            implemented_data_info=None,
+                                            implemented_domain=None,
+                                            implemented_predicates=frozenset(),
+                                            seen_dtypes=frozenset(),
+                                            seen_functions=frozenset(),
+                                            seen_atomic_dtypes=frozenset(),
+                                            var_subst_map={},
+                                            allow_complex=False,
+                                            is_generating_device_code=True,
+                                            )
+
+        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+            f.write("{}\n".format(next(iter(decl.generate()))))
 
-        f.writelines(["int main(int argc, char** argv)\n",
-                      "{\n",
-                      ])
 
+def write_setup_code(sf, filename, define_thetas=True):
+    opcounting = get_option("opcounter")
+    with open(filename, "a") as f:
         # Setup a polynomial object (normally done in the LocalOperator members)
-        opcounting = get_option("opcounter")
         set_option("opcounter", False)
         from dune.codegen.loopy.target import type_floatingpoint
         real = type_floatingpoint()
@@ -98,7 +108,7 @@ def generate_standalone_code(sf, filename):
         for deg in set(degs):
             f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
 
-        # Get kernels
+        # Get kernel
         from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
         knl = realize_sumfact_kernel_function(sf)
         constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
@@ -142,8 +152,9 @@ def generate_standalone_code(sf, filename):
                                             is_generating_device_code=True,
                                             )
 
-        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
-            f.write("  {}\n".format(next(iter(decl.generate()))))
+        if define_thetas:
+            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+                f.write("  {}\n".format(next(iter(decl.generate()))))
 
         for _, line in constructor_knl.preambles:
             if "gfsu" not in line:
@@ -161,7 +172,89 @@ def generate_standalone_code(sf, filename):
                       "  rng.seed(42);\n",
                       "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                       ])
 
+    return opcounting
+
+
+def generate_standalone_code_google_benchmark(sf, filename):
+    delete_cache_items("kernel_default")
+
+    # Extract sum factorization kernel
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+
+    # Add the implementation of the kernel.
+    # TODO: This can probably be done in a safer way?
+    first_line = knl.member.lines[0]
+    arguments = first_line[first_line.find("(")+1:first_line.find(")")]
+
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(first_line),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include \"benchmark/benchmark.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+
+    write_global_data(sf, filename)
+
+    with open(filename, "a") as f:
+        f.write("void sumfact_kernel({})\n".format(arguments))
+        for line in knl.member.lines[1:]:
+            f.write("{}\n".format(line))
+
+        f.write("\n\n")
+        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
+
+    write_setup_code(sf, filename, define_thetas=False)
+
+    with open(filename, "a") as f:
+        f.writelines(["  for (auto _ : state){\n",
+                      "    sumfact_kernel(buffer0, buffer1);\n",
+                      "  }\n",
+                      "}\n",
+                      "BENCHMARK(BM_sumfact_kernel);\n",
+                      "\n",
+                      "BENCHMARK_MAIN();"
+                      ])
+
+
+def generate_standalone_code(sf, filename):
+    delete_cache_items("kernel_default")
+    # Extract sum factorization kernel
+    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
+    knl = realize_sumfact_kernel_function(sf)
+
+    with open(filename, "w") as f:
+        f.writelines(["// {}".format(knl.member.lines[0]),
+                      "\n",
+                      "#include \"config.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/codegen/common/tsc.hh>\n",
+                      "#include<dune/codegen/common/vectorclass.hh>\n",
+                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+
+        f.writelines(["int main(int argc, char** argv)\n",
+                      "{\n",
+                      ])
+
+    opcounting = write_setup_code(sf, filename)
+
+    # Write measurement
+    with open(filename, "a") as f:
open(filename, "a") as f: # Start a TSC timer f.writelines([" auto start = Dune::PDELab::TSC::start();\n", ]) @@ -204,12 +297,15 @@ def autotune_realization(sf): with cache_restoring(): with filelock.FileLock(lock): if not os.path.isfile(logname): - generate_standalone_code(sf, filename) + if get_option("autotune_google_benchmark"): + generate_standalone_code_google_benchmark(sf, filename) + else: + generate_standalone_code(sf, filename) devnull = open(os.devnull, 'w') ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT) if ret != 0: - raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename)))) + raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(executable, filename)))) # Check whether the user specified an execution wrapper call = [] @@ -219,10 +315,20 @@ def autotune_realization(sf): # Run the benchmark program call.append(executable) - call.append(logname) + if get_option("autotune_google_benchmark"): + call.append("--benchmark_out={}".format(logname)) + # call.append("--benchmark_out_format=csv") + else: + call.append(logname) ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT) if ret != 0: raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call))) # Extract the result form the log file - return float(next(iter(open(logname, "r")))) / 1000000 + if get_option("autotune_google_benchmark"): + import json + with open(logname) as json_file: + data = json.load(json_file) + return data['benchmarks'][0]['cpu_time'] + else: + return float(next(iter(open(logname, "r")))) / 1000000 -- GitLab