Skip to content
Snippets Groups Projects
Commit a8d7c621 authored by René Heß
Browse files

Merge branch 'master' into feature/sumfact-loop-reordering

parents 5292297f 16aa99f4
No related branches found
No related tags found
No related merge requests found
Showing
with 591 additions and 136 deletions
#!/bin/bash
# Wrapper for running a benchmark command on a cluster node:
# loads the required environment modules, runs the command given as
# arguments, and propagates its exit code to the caller.
# NOTE(review): `ml` is presumably the Lmod shorthand for `module load` —
# this script only works where that module system is available.
ml gcc/6.4.0
ml benchmark/1.4.0
ml python/3.6.3
ml openmpi
ml cmake
ml openblas
ml metis
ml suite-sparse
ml superlu
ml parmetis
# Execute the given command line in a subshell and capture its exit status.
("$@")
code=$?
echo "Code: $code"
# Brief pause before exiting — presumably to let output/filesystem settle
# on the cluster; TODO confirm whether this is still needed.
sleep 0.1s
# Hand the wrapped command's exit code back to the caller (e.g. CMake/ctest).
exit $code
......@@ -79,6 +79,8 @@
# generator is run.
#
find_package(benchmark)
add_custom_target(generation)
# Gather a list of form compiler sources to add as dependencies
......
......@@ -21,6 +21,9 @@ dune_python_add_test(NAME pep8-ourcode
add_subdirectory(test)
# Add a dummy target to extract compiler flags for the autotune tool chain
add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
if(benchmark_FOUND)
target_link_libraries(_autotune_target benchmark::benchmark)
endif()
This diff is collapsed.
......@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
target_name = CodegenOption(default=None, helpstr="The target name from CMake")
operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
# Arguments that are mainly to be set by logic depending on other options
max_vector_width = CodegenOption(default=256, helpstr=None)
......@@ -101,6 +102,8 @@ class CodegenFormOptionsArray(ImmutableRecord):
blockstructured = CodegenOption(default=False, helpstr="Use block structure")
number_of_blocks = CodegenOption(default=1, helpstr="Number of sub blocks in one direction")
vectorization_blockstructured = CodegenOption(default=False, helpstr="Vectorize block structuring")
vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'nunmber_of_blocks' is not divisible by vector length")
vectorization_blockstructured_tail_ordering = CodegenOption(default='consecutive', helpstr="Ordering of the tail w.r.t the vectorized loop. Possible values: consecutive|blocked")
adjoint = CodegenOption(default=False, helpstr="Generate adjoint operator")
control = CodegenOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable")
objective_function = CodegenOption(default=None, helpstr="Name of form representing the objective function in UFL file")
......
......@@ -65,28 +65,38 @@ def compiler_invocation(name, filename):
return compile_flags
def generate_standalone_code(sf, filename):
delete_cache_items("kernel_default")
def write_global_data(sf, filename):
opcounting = get_option("opcounter")
with open(filename, "a") as f:
# Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
with open(filename, "w") as f:
f.writelines(["#include \"config.h\"\n",
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
"#include<dune/codegen/common/tsc.hh>\n",
"#include<dune/codegen/common/vectorclass.hh>\n",
"#include<dune/codegen/sumfact/onedquadrature.hh>\n",
"#include<dune/codegen/sumfact/horizontaladd.hh>\n",
"#include<random>\n",
"#include<fstream>\n",
"#include<iostream>\n",
"\n"
])
target = DuneTarget()
from loopy.codegen import CodeGenerationState
codegen_state = CodeGenerationState(kernel=constructor_knl,
implemented_data_info=None,
implemented_domain=None,
implemented_predicates=frozenset(),
seen_dtypes=frozenset(),
seen_functions=frozenset(),
seen_atomic_dtypes=frozenset(),
var_subst_map={},
allow_complex=False,
is_generating_device_code=True,
)
f.writelines(["int main(int argc, char** argv)\n",
"{\n",
])
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write("{}\n".format(next(iter(decl.generate()))))
def write_setup_code(sf, filename, define_thetas=True):
opcounting = get_option("opcounter")
with open(filename, "a") as f:
# Setup a polynomial object (normally done in the LocalOperator members)
opcounting = get_option("opcounter")
set_option("opcounter", False)
from dune.codegen.loopy.target import type_floatingpoint
real = type_floatingpoint()
......@@ -98,7 +108,7 @@ def generate_standalone_code(sf, filename):
for deg in set(degs):
f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
# Get kernels
# Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
......@@ -119,7 +129,7 @@ def generate_standalone_code(sf, filename):
f.write("{} = 0;\n".format(arg))
else:
size = sf.interface.fastdg_interface_object_size
f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))
f.write(" RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))
# Write stuff into the input buffer
f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),
......@@ -142,8 +152,9 @@ def generate_standalone_code(sf, filename):
is_generating_device_code=True,
)
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write(" {}\n".format(next(iter(decl.generate()))))
if define_thetas:
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write(" {}\n".format(next(iter(decl.generate()))))
for _, line in constructor_knl.preambles:
if "gfsu" not in line:
......@@ -161,13 +172,102 @@ def generate_standalone_code(sf, filename):
" rng.seed(42);\n",
" std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
])
return opcounting
def generate_standalone_code_google_benchmark(sf, filename):
    """Write a standalone C++ microbenchmark for a sum factorization kernel.

    The generated translation unit wraps the realized kernel in a
    google-benchmark ``BM_sumfact_kernel`` function and emits a
    ``BENCHMARK_MAIN()`` entry point, so the resulting executable can be
    driven by the google-benchmark runner during autotuning.

    Parameters
    ----------
    sf : sum factorization kernel description object
        Must provide ``interface.signature_args``; realized via
        :func:`realize_sumfact_kernel_function`.
    filename : str
        Path of the C++ file to (over)write.

    Side effects: truncates/overwrites ``filename``, clears the
    ``kernel_default`` cache, and temporarily disables the ``opcounter``
    option (restored before returning, via the value returned by
    ``write_setup_code``).
    """
    delete_cache_items("kernel_default")

    # Realize the sum factorization kernel function.
    knl = realize_sumfact_kernel_function(sf)

    # Recover the kernel's argument list from the textual signature.
    # TODO: This can probably be done in a safer way?
    first_line = knl.member.lines[0]
    arguments = first_line[first_line.find("(") + 1:first_line.find(")")]

    with open(filename, "w") as f:
        f.writelines(["// {}".format(first_line),
                      "\n",
                      "#include \"config.h\"\n",
                      "#include \"benchmark/benchmark.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/codegen/common/vectorclass.hh>\n",
                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])

    write_global_data(sf, filename)

    # Emit the kernel implementation as a free function.
    with open(filename, "a") as f:
        f.write("void sumfact_kernel({})\n".format(arguments))
        for line in knl.member.lines[1:]:
            f.write("{}\n".format(line))
        f.write("\n\n")
        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")

    # write_setup_code switches the opcounter option off and returns its
    # previous value so we can restore it below.
    opcounting = write_setup_code(sf, filename, define_thetas=False)

    # Extra arguments beyond the two buffers, taken from the interface
    # signature ("type name" strings -> last token is the name).
    additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
    additional_arguments = ', '.join(additional_arguments)
    if additional_arguments:
        additional_arguments = ', ' + additional_arguments

    with open(filename, "a") as f:
        f.writelines([" for (auto _ : state){\n",
                      " sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments),
                      " }\n",
                      "}\n",
                      "BENCHMARK(BM_sumfact_kernel);\n",
                      "\n",
                      "BENCHMARK_MAIN();"
                      ])

    set_option("opcounter", opcounting)
def generate_standalone_code(sf, filename):
delete_cache_items("kernel_default")
# Extract sum factorization kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
first_line = knl.member.lines[0]
with open(filename, "w") as f:
f.writelines(["// {}".format(first_line),
"\n",
"#include \"config.h\"\n",
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
"#include<dune/codegen/common/tsc.hh>\n",
"#include<dune/codegen/common/vectorclass.hh>\n",
"#include<dune/codegen/sumfact/onedquadrature.hh>\n",
"#include<dune/codegen/sumfact/horizontaladd.hh>\n",
"#include<random>\n",
"#include<fstream>\n",
"#include<iostream>\n",
"\n"
])
f.writelines(["int main(int argc, char** argv)\n",
"{\n",
])
opcounting = write_setup_code(sf, filename)
# Write measurement
with open(filename, "a") as f:
# Start a TSC timer
f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
])
# Add the implementation of the kernel.
f.write(" for(int i=0; i<{}; ++i)\n".format(int(1e9 / sf.operations)))
repeats = int(1e9 / sf.operations)
f.write(" for(int i=0; i<{}; ++i)\n".format(repeats))
f.write(" {\n")
for line in knl.member.lines[1:]:
f.write(" {}\n".format(line))
......@@ -177,7 +277,7 @@ def generate_standalone_code(sf, filename):
f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n",
" std::ofstream file;\n",
" file.open(argv[1]);\n",
" file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
" file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
" file.close();\n",
" accum += output[dis(rng)];\n",
" std::cout << accum;\n",
......@@ -204,25 +304,44 @@ def autotune_realization(sf):
with cache_restoring():
with filelock.FileLock(lock):
if not os.path.isfile(logname):
generate_standalone_code(sf, filename)
if get_option("autotune_google_benchmark"):
generate_standalone_code_google_benchmark(sf, filename)
else:
generate_standalone_code(sf, filename)
call = []
wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_COMPILATION_WRAPPER")
if wrapper:
call.append(wrapper)
call.extend(compiler_invocation(executable, filename))
devnull = open(os.devnull, 'w')
ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT)
ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
if ret != 0:
raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename))))
raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
# Check whether the user specified an execution wrapper
call = []
wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_WRAPPER")
wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER")
if wrapper:
call.append(wrapper)
# Run the benchmark program
call.append(executable)
call.append(logname)
if get_option("autotune_google_benchmark"):
call.append("--benchmark_out={}".format(logname))
# call.append("--benchmark_out_format=csv")
else:
call.append(logname)
ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
if ret != 0:
raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
# Extract the result form the log file
return float(next(iter(open(logname, "r")))) / 1000000
if get_option("autotune_google_benchmark"):
import json
with open(logname) as json_file:
data = json.load(json_file)
return data['benchmarks'][0]['cpu_time']
else:
return float(next(iter(open(logname, "r")))) / 1000000
......@@ -41,4 +41,9 @@ dune_add_formcompiler_system_test(UFLFILE poisson.ufl
dune_add_formcompiler_system_test(UFLFILE poisson.ufl
BASENAME blockstructured_poisson_grid
INIFILE poisson_grid.mini
)
\ No newline at end of file
)
dune_add_formcompiler_system_test(UFLFILE poisson.ufl
BASENAME blockstructured_poisson_vec_tail
INIFILE poisson_vec_tail.mini
)
__name = blockstructured_poisson_tensor_{__exec_suffix}
__exec_suffix = {grid_suffix}_{vec_suffix}_{dim_suffix}
__exec_suffix = {grid_suffix}_{vec_suffix}_{dim_suffix}_blocks_{blocks}
dim = 2, 3 | expand dimension
blocks_2d = 8, 7 | expand blocks
blocks_3d = 4, 5 | expand blocks
blocks = {blocks_2d}, {blocks_3d} | expand dimension
grid_suffix = structured, unstructured | expand unstructured
vec_suffix = nonvec, vec | expand vectorized
dim_suffix = 2d, 3d | expand dimension
......@@ -26,7 +31,7 @@ matrix_free = 1
vectorization_blockstructured = 0, 1 | expand vectorized
generate_jacobians = 0
blockstructured = 1
number_of_blocks = 8, 4 | expand dimension
number_of_blocks = {blocks}
geometry_mixins = blockstructured_equidistant, blockstructured_multilinear | expand unstructured
[formcompiler.ufl_variants]
......
__name = blockstructured_poisson_vec_tail_{__exec_suffix}
__exec_suffix = {dimname}_{tail_suffix}
dim = 2, 3 | expand dimension
dimname = 2d, 3d | expand dimension
cells = 8, 2 | expand dimension | repeat {dim}
extension = 1. | repeat {dim}
tail_vec = 0, 1 | expand tail_vec
tail_modus = consecutive, blocked | expand mod
tail_suffix = novec_{tail_modus}, vec_{tail_modus} | expand tail_vec
[wrapper.vtkcompare]
name = {__name}
reference = poisson_ref
extension = vtu
[formcompiler]
compare_l2errorsquared = 1e-7
[formcompiler.r]
matrix_free = 1
generate_jacobians = 0
blockstructured = 1
number_of_blocks = 15, 7 | expand dimension
vectorization_blockstructured = 1
vectorization_blockstructured_tail = {tail_vec}
vectorization_blockstructured_tail_ordering = {tail_modus}
geometry_mixins = blockstructured_equidistant
[formcompiler.ufl_variants]
cell = quadrilateral, hexahedron | expand dimension
degree = 1
\ No newline at end of file
......@@ -119,3 +119,18 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
BASENAME sumfact_poisson_dg_3d_diagonal
INIFILE diagonal.mini
)
#======================================
# Test autotuning with google-benchmark
#======================================
if(benchmark_FOUND)
dune_add_formcompiler_system_test(UFLFILE poisson_3d.ufl
BASENAME sumfact_poisson_3d_benchmark
INIFILE poisson_3d_benchmark.mini
)
dune_add_formcompiler_system_test(UFLFILE poisson_dg_volumes_3d.ufl
BASENAME sumfact_poisson_fastdg_volumes_3d_benchmark
INIFILE poisson_fastdg_volumes_3d_benchmark.mini
)
endif()
__name = sumfact_poisson_3d_benchmark_{__exec_suffix}
__exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
deg_suffix = deg{formcompiler.ufl_variants.degree}
diff_suffix = symdiff
quadvec_suffix = quadvec
gradvec_suffix = autotunevec
cells = 8 8 8
extension = 1. 1. 1.
[wrapper.vtkcompare]
name = {__name}
reference = poisson_ref
extension = vtu
[formcompiler]
compare_l2errorsquared = 1e-4
autotune_google_benchmark = 1
[formcompiler.r]
numerical_jacobian = 0
sumfact = 1
vectorization_quadloop = 1
vectorization_strategy = autotune
geometry_mixins = sumfact_equidistant
[formcompiler.ufl_variants]
degree = 1
cell = hexahedron
dim = 3
x = SpatialCoordinate(cell)
c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2
g = exp(-1.*c)
f = 2*(3.-2*c)*g
V = FiniteElement("DG", cell, degree)
u = TrialFunction(V)
v = TestFunction(V)
n = FacetNormal(cell)('+')
# penalty factor
alpha = 1.0
h_ext = CellVolume(cell) / FacetArea(cell)
gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0
theta = 1.0
r = inner(grad(u), grad(v))*dx \
- f*v*dx
# - inner(n, avg(grad(u)))*jump(v)*dS \
# + gamma_int*jump(u)*jump(v)*dS \
# + theta*jump(u)*inner(avg(grad(v)), n)*dS \
# - inner(n, grad(u))*v*ds \
# + gamma_ext*u*v*ds \
# + theta*u*inner(grad(v), n)*ds \
# - gamma_ext*g*v*ds \
# - theta*g*inner(grad(v), n)*ds
exact_solution = g
__name = sumfact_poisson_fastdg_volumes_3d_benchmark_{__exec_suffix}
__exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
deg_suffix = deg{formcompiler.ufl_variants.degree}
diff_suffix = symdiff
quadvec_suffix = quadvec
gradvec_suffix = autotunevec
cells = 8 8 8
extension = 1. 1. 1.
[wrapper.vtkcompare]
name = {__name}
reference = poisson_ref
extension = vtu
[formcompiler]
# Since this test makes a DG scheme without skeletons the solution is garbage.
# This test just tests generation of microbenchmarks.
# compare_l2errorsquared = 1e-4
autotune_google_benchmark = 1
[formcompiler.r]
numerical_jacobian = 0
sumfact = 1
vectorization_quadloop = 1
vectorization_strategy = autotune
fastdg = 1
geometry_mixins = sumfact_equidistant
[formcompiler.ufl_variants]
degree = 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment