Skip to content
Snippets Groups Projects
Commit 4ee4572a authored by René Heß
Browse files

[skip ci] Make it possible to use google-benchmark for autotuning

parent 03d54df6
No related branches found
No related tags found
No related merge requests found
...@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord): ...@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
target_name = CodegenOption(default=None, helpstr="The target name from CMake") target_name = CodegenOption(default=None, helpstr="The target name from CMake")
operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!") operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).") debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
# Arguments that are mainly to be set by logic depending on other options # Arguments that are mainly to be set by logic depending on other options
max_vector_width = CodegenOption(default=256, helpstr=None) max_vector_width = CodegenOption(default=256, helpstr=None)
......
...@@ -65,28 +65,38 @@ def compiler_invocation(name, filename): ...@@ -65,28 +65,38 @@ def compiler_invocation(name, filename):
return compile_flags return compile_flags
def generate_standalone_code(sf, filename): def write_global_data(sf, filename):
delete_cache_items("kernel_default") opcounting = get_option("opcounter")
with open(filename, "a") as f:
# Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
with open(filename, "w") as f: target = DuneTarget()
f.writelines(["#include \"config.h\"\n", from loopy.codegen import CodeGenerationState
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", codegen_state = CodeGenerationState(kernel=constructor_knl,
"#include<dune/codegen/common/tsc.hh>\n", implemented_data_info=None,
"#include<dune/codegen/common/vectorclass.hh>\n", implemented_domain=None,
"#include<dune/codegen/sumfact/onedquadrature.hh>\n", implemented_predicates=frozenset(),
"#include<dune/codegen/sumfact/horizontaladd.hh>\n", seen_dtypes=frozenset(),
"#include<random>\n", seen_functions=frozenset(),
"#include<fstream>\n", seen_atomic_dtypes=frozenset(),
"#include<iostream>\n", var_subst_map={},
"\n" allow_complex=False,
]) is_generating_device_code=True,
)
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write("{}\n".format(next(iter(decl.generate()))))
f.writelines(["int main(int argc, char** argv)\n",
"{\n",
])
def write_setup_code(sf, filename, define_thetas=True):
opcounting = get_option("opcounter")
with open(filename, "a") as f:
# Setup a polynomial object (normally done in the LocalOperator members) # Setup a polynomial object (normally done in the LocalOperator members)
opcounting = get_option("opcounter")
set_option("opcounter", False) set_option("opcounter", False)
from dune.codegen.loopy.target import type_floatingpoint from dune.codegen.loopy.target import type_floatingpoint
real = type_floatingpoint() real = type_floatingpoint()
...@@ -98,7 +108,7 @@ def generate_standalone_code(sf, filename): ...@@ -98,7 +108,7 @@ def generate_standalone_code(sf, filename):
for deg in set(degs): for deg in set(degs):
f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg))) f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
# Get kernels # Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf) knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False) constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
...@@ -142,8 +152,9 @@ def generate_standalone_code(sf, filename): ...@@ -142,8 +152,9 @@ def generate_standalone_code(sf, filename):
is_generating_device_code=True, is_generating_device_code=True,
) )
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0): if define_thetas:
f.write(" {}\n".format(next(iter(decl.generate())))) for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write(" {}\n".format(next(iter(decl.generate()))))
for _, line in constructor_knl.preambles: for _, line in constructor_knl.preambles:
if "gfsu" not in line: if "gfsu" not in line:
...@@ -161,7 +172,89 @@ def generate_standalone_code(sf, filename): ...@@ -161,7 +172,89 @@ def generate_standalone_code(sf, filename):
" rng.seed(42);\n", " rng.seed(42);\n",
" std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)), " std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
]) ])
return opcounting
def generate_standalone_code_google_benchmark(sf, filename):
    """Write a standalone google-benchmark C++ program for a sumfact kernel.

    The file at ``filename`` is created (any previous content is truncated)
    and filled in several steps:

    1. a comment with the kernel's first line, followed by the includes,
    2. whatever ``write_global_data`` appends,
    3. the kernel body as a free function ``sumfact_kernel``,
    4. a benchmark function ``BM_sumfact_kernel`` whose setup part is
       appended by ``write_setup_code`` and whose measured loop calls
       ``sumfact_kernel(buffer0, buffer1)``,
    5. the ``BENCHMARK``/``BENCHMARK_MAIN`` registration.

    :arg sf: The sum factorization kernel description; forwarded to
        ``realize_sumfact_kernel_function``, ``write_global_data`` and
        ``write_setup_code``.
    :arg filename: Path of the C++ source file to generate.
    """
    delete_cache_items("kernel_default")

    # Extract sum factorization kernel
    # NOTE(review): extract_kernel_from_cache is not referenced in this
    # function; the import is kept unchanged in case it is needed for its
    # side effects -- confirm and drop it otherwise.
    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
    knl = realize_sumfact_kernel_function(sf)

    # Add the implementation of the kernel.
    # TODO: This can probably be done in a safer way. The argument list is
    # taken as the text between the first "(" and the first ")" of the
    # kernel's first line.
    first_line = knl.member.lines[0]
    arguments = first_line[first_line.find("(") + 1:first_line.find(")")]

    with open(filename, "w") as f:
        f.writelines(["// {}".format(first_line),
                      "\n",
                      "#include \"config.h\"\n",
                      "#include \"benchmark/benchmark.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/codegen/common/vectorclass.hh>\n",
                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])

    # The helpers reopen the file in append mode themselves, so our own
    # handle has to be closed before each of these calls.
    write_global_data(sf, filename)

    with open(filename, "a") as f:
        f.write("void sumfact_kernel({})\n".format(arguments))
        for line in knl.member.lines[1:]:
            f.write("{}\n".format(line))

        f.write("\n\n")
        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")

    # write_setup_code switches the global "opcounter" option off and hands
    # back the previous value. Keep it so the option can be restored below
    # instead of leaking the disabled state to later code generation.
    # NOTE(review): the kernel above is realized before opcounter is
    # switched off -- confirm that this ordering is intended.
    opcounting = write_setup_code(sf, filename, define_thetas=False)

    try:
        with open(filename, "a") as f:
            f.writelines([" for (auto _ : state){\n",
                          " sumfact_kernel(buffer0, buffer1);\n",
                          " }\n",
                          "}\n",
                          "BENCHMARK(BM_sumfact_kernel);\n",
                          "\n",
                          "BENCHMARK_MAIN();"
                          ])
    finally:
        # Restore the user's opcounter setting.
        set_option("opcounter", opcounting)
def generate_standalone_code(sf, filename):
delete_cache_items("kernel_default")
# Extract sum factorization kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
with open(filename, "w") as f:
f.writelines(["// {}".format(first_line),
"\n",
"#include \"config.h\"\n",
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
"#include<dune/codegen/common/tsc.hh>\n",
"#include<dune/codegen/common/vectorclass.hh>\n",
"#include<dune/codegen/sumfact/onedquadrature.hh>\n",
"#include<dune/codegen/sumfact/horizontaladd.hh>\n",
"#include<random>\n",
"#include<fstream>\n",
"#include<iostream>\n",
"\n"
])
f.writelines(["int main(int argc, char** argv)\n",
"{\n",
])
opcounting = write_setup_code(sf, filename)
# Write measurement
with open(filename, "a") as f:
# Start a TSC timer # Start a TSC timer
f.writelines([" auto start = Dune::PDELab::TSC::start();\n", f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
]) ])
...@@ -204,12 +297,15 @@ def autotune_realization(sf): ...@@ -204,12 +297,15 @@ def autotune_realization(sf):
with cache_restoring(): with cache_restoring():
with filelock.FileLock(lock): with filelock.FileLock(lock):
if not os.path.isfile(logname): if not os.path.isfile(logname):
generate_standalone_code(sf, filename) if get_option("autotune_google_benchmark"):
generate_standalone_code_google_benchmark(sf, filename)
else:
generate_standalone_code(sf, filename)
devnull = open(os.devnull, 'w') devnull = open(os.devnull, 'w')
ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT) ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT)
if ret != 0: if ret != 0:
raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename)))) raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(executable, filename))))
# Check whether the user specified an execution wrapper # Check whether the user specified an execution wrapper
call = [] call = []
...@@ -219,10 +315,20 @@ def autotune_realization(sf): ...@@ -219,10 +315,20 @@ def autotune_realization(sf):
# Run the benchmark program # Run the benchmark program
call.append(executable) call.append(executable)
call.append(logname) if get_option("autotune_google_benchmark"):
call.append("--benchmark_out={}".format(logname))
# call.append("--benchmark_out_format=csv")
else:
call.append(logname)
ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT) ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
if ret != 0: if ret != 0:
raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call))) raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
# Extract the result form the log file # Extract the result form the log file
return float(next(iter(open(logname, "r")))) / 1000000 if get_option("autotune_google_benchmark"):
import json
with open(logname) as json_file:
data = json.load(json_file)
return data['benchmarks'][0]['cpu_time']
else:
return float(next(iter(open(logname, "r")))) / 1000000
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment