Skip to content
Snippets Groups Projects
Commit 20df5b20 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

[!312] Feature/autotuning google benchmark

Merge branch 'feature/autotuning-google-benchmark' into 'master'

ref:extensions/dune-codegen Make it possible to use google-benchmark for
autotuning.

See merge request [extensions/dune-codegen!312]

  [extensions/dune-codegen!312]: gitlab.dune-project.org/extensions/dune-codegen/merge_requests/312
parents 15774221 51e194b1
No related branches found
No related tags found
No related merge requests found
......@@ -79,6 +79,8 @@
# generator is run.
#
find_package(benchmark)
add_custom_target(generation)
# Gather a list of form compiler sources to add as dependencies
......
......@@ -21,6 +21,9 @@ dune_python_add_test(NAME pep8-ourcode
add_subdirectory(test)
# Add a dummy target to extract compiler flags for the autotune tool chain
add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
if(benchmark_FOUND)
target_link_libraries(_autotune_target benchmark::benchmark)
endif()
......@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
target_name = CodegenOption(default=None, helpstr="The target name from CMake")
operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
# Arguments that are mainly to be set by logic depending on other options
max_vector_width = CodegenOption(default=256, helpstr=None)
......
......@@ -65,28 +65,38 @@ def compiler_invocation(name, filename):
return compile_flags
def generate_standalone_code(sf, filename):
delete_cache_items("kernel_default")
def write_global_data(sf, filename):
opcounting = get_option("opcounter")
with open(filename, "a") as f:
# Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
with open(filename, "w") as f:
f.writelines(["#include \"config.h\"\n",
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
"#include<dune/codegen/common/tsc.hh>\n",
"#include<dune/codegen/common/vectorclass.hh>\n",
"#include<dune/codegen/sumfact/onedquadrature.hh>\n",
"#include<dune/codegen/sumfact/horizontaladd.hh>\n",
"#include<random>\n",
"#include<fstream>\n",
"#include<iostream>\n",
"\n"
])
target = DuneTarget()
from loopy.codegen import CodeGenerationState
codegen_state = CodeGenerationState(kernel=constructor_knl,
implemented_data_info=None,
implemented_domain=None,
implemented_predicates=frozenset(),
seen_dtypes=frozenset(),
seen_functions=frozenset(),
seen_atomic_dtypes=frozenset(),
var_subst_map={},
allow_complex=False,
is_generating_device_code=True,
)
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write("{}\n".format(next(iter(decl.generate()))))
f.writelines(["int main(int argc, char** argv)\n",
"{\n",
])
def write_setup_code(sf, filename, define_thetas=True):
opcounting = get_option("opcounter")
with open(filename, "a") as f:
# Setup a polynomial object (normally done in the LocalOperator members)
opcounting = get_option("opcounter")
set_option("opcounter", False)
from dune.codegen.loopy.target import type_floatingpoint
real = type_floatingpoint()
......@@ -98,7 +108,7 @@ def generate_standalone_code(sf, filename):
for deg in set(degs):
f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
# Get kernels
# Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
......@@ -142,8 +152,9 @@ def generate_standalone_code(sf, filename):
is_generating_device_code=True,
)
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write(" {}\n".format(next(iter(decl.generate()))))
if define_thetas:
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write(" {}\n".format(next(iter(decl.generate()))))
for _, line in constructor_knl.preambles:
if "gfsu" not in line:
......@@ -161,7 +172,90 @@ def generate_standalone_code(sf, filename):
" rng.seed(42);\n",
" std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
])
return opcounting
def generate_standalone_code_google_benchmark(sf, filename):
    """Write a standalone google-benchmark C++ source file for ``sf``.

    The file ``filename`` is created (any previous content is truncated) and
    then filled in several append passes:

    1. a comment carrying the first line of the realized kernel, followed by
       the include directives,
    2. whatever ``write_global_data`` appends,
    3. the kernel implementation as a free function ``sumfact_kernel`` and
       the opening line of ``BM_sumfact_kernel``,
    4. whatever ``write_setup_code`` appends when called with
       ``define_thetas=False``,
    5. the benchmark loop, the ``BENCHMARK`` registration and
       ``BENCHMARK_MAIN()``.

    :arg sf: The sum factorization kernel object handed to
        ``realize_sumfact_kernel_function``
    :arg filename: Path of the C++ file to generate
    """
    delete_cache_items("kernel_default")

    # Realize the sum factorization kernel that is benchmarked
    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
    kernel = realize_sumfact_kernel_function(sf)

    # The argument list is cut out of the first line of the kernel, between
    # the first "(" and the first ")".
    # TODO: This can probably be done in a safer way?
    signature = kernel.member.lines[0]
    args_begin = signature.find("(") + 1
    args_end = signature.find(")")
    arguments = signature[args_begin:args_end]

    # Pass 1: create the file with the header comment and the includes
    header_lines = (
        "// {}".format(signature),
        "\n",
        "#include \"config.h\"\n",
        "#include \"benchmark/benchmark.h\"\n",
        "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
        "#include<dune/codegen/common/vectorclass.hh>\n",
        "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
        "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
        "#include<random>\n",
        "#include<fstream>\n",
        "#include<iostream>\n",
        "\n",
    )
    with open(filename, "w") as out:
        out.write("".join(header_lines))

    # Pass 2: this helper reopens the file in append mode itself
    write_global_data(sf, filename)

    # Pass 3: kernel implementation and the head of the benchmark function
    with open(filename, "a") as out:
        out.write("void sumfact_kernel({})\n".format(arguments))
        out.writelines("{}\n".format(body_line) for body_line in kernel.member.lines[1:])
        out.write("\n\n")
        out.write("static void BM_sumfact_kernel(benchmark::State& state){\n")

    # Pass 4: setup code inside the benchmark function.
    # NOTE(review): the return value of write_setup_code is discarded here,
    # while generate_standalone_code stores it as `opcounting` -- confirm
    # that nothing needs to be restored in this code path.
    write_setup_code(sf, filename, define_thetas=False)

    # Pass 5: measurement loop and benchmark registration.
    # presumably buffer0/buffer1 are declared by the setup code -- verify
    footer_lines = (
        "  for (auto _ : state){\n",
        "    sumfact_kernel(buffer0, buffer1);\n",
        "  }\n",
        "}\n",
        "BENCHMARK(BM_sumfact_kernel);\n",
        "\n",
        "BENCHMARK_MAIN();",
    )
    with open(filename, "a") as out:
        for chunk in footer_lines:
            out.write(chunk)
def generate_standalone_code(sf, filename):
delete_cache_items("kernel_default")
# Extract sum factorization kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
first_line = knl.member.lines[0]
with open(filename, "w") as f:
f.writelines(["// {}".format(first_line),
"\n",
"#include \"config.h\"\n",
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
"#include<dune/codegen/common/tsc.hh>\n",
"#include<dune/codegen/common/vectorclass.hh>\n",
"#include<dune/codegen/sumfact/onedquadrature.hh>\n",
"#include<dune/codegen/sumfact/horizontaladd.hh>\n",
"#include<random>\n",
"#include<fstream>\n",
"#include<iostream>\n",
"\n"
])
f.writelines(["int main(int argc, char** argv)\n",
"{\n",
])
opcounting = write_setup_code(sf, filename)
# Write measurement
with open(filename, "a") as f:
# Start a TSC timer
f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
])
......@@ -205,12 +299,15 @@ def autotune_realization(sf):
with cache_restoring():
with filelock.FileLock(lock):
if not os.path.isfile(logname):
generate_standalone_code(sf, filename)
if get_option("autotune_google_benchmark"):
generate_standalone_code_google_benchmark(sf, filename)
else:
generate_standalone_code(sf, filename)
devnull = open(os.devnull, 'w')
ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT)
if ret != 0:
raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename))))
raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(executable, filename))))
# Check whether the user specified an execution wrapper
call = []
......@@ -220,10 +317,20 @@ def autotune_realization(sf):
# Run the benchmark program
call.append(executable)
call.append(logname)
if get_option("autotune_google_benchmark"):
call.append("--benchmark_out={}".format(logname))
# call.append("--benchmark_out_format=csv")
else:
call.append(logname)
ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
if ret != 0:
raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
# Extract the result from the log file
return float(next(iter(open(logname, "r")))) / 1000000
if get_option("autotune_google_benchmark"):
import json
with open(logname) as json_file:
data = json.load(json_file)
return data['benchmarks'][0]['cpu_time']
else:
return float(next(iter(open(logname, "r")))) / 1000000
......@@ -119,3 +119,13 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
BASENAME sumfact_poisson_dg_3d_diagonal
INIFILE diagonal.mini
)
#======================================
# Test autotuning with google-benchmark
#======================================
if(benchmark_FOUND)
dune_add_formcompiler_system_test(UFLFILE poisson_3d.ufl
BASENAME sumfact_poisson_3d_benchmark
INIFILE poisson_3d_benchmark.mini
)
endif()
__name = sumfact_poisson_3d_benchmark_{__exec_suffix}
__exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
deg_suffix = deg{formcompiler.ufl_variants.degree}
diff_suffix = symdiff
quadvec_suffix = quadvec
gradvec_suffix = autotunevec
cells = 8 8 8
extension = 1. 1. 1.
[wrapper.vtkcompare]
name = {__name}
reference = poisson_ref
extension = vtu
[formcompiler]
compare_l2errorsquared = 1e-4
autotune_google_benchmark = 1
[formcompiler.r]
numerical_jacobian = 0
sumfact = 1
vectorization_quadloop = 1
vectorization_strategy = autotune
geometry_mixins = sumfact_equidistant
[formcompiler.ufl_variants]
degree = 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment