""" Autotuning for sum factorization kernels """
from dune.codegen.generation import cache_restoring, delete_cache_items
from dune.codegen.loopy.target import DuneTarget
from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
from dune.codegen.options import get_option, set_option
from dune.codegen.error import CodegenAutotuneError

import loopy as lp
from pytools import product

import hashlib
import os
import re
import subprocess

import filelock


def get_cmake_cache_entry(entry):
    for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
        match = re.match("{}:[INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED|STATIC]+=(.*)".format(entry), line)
        if match:
            return match.groups()[0]
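
# Illustration only: a CMakeCache.txt line such as
#
#   CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++
#
# makes get_cmake_cache_entry("CMAKE_CXX_COMPILER") return "/usr/bin/g++".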


def get_dune_codegen_dir():
    if get_cmake_cache_entry("CMAKE_PROJECT_NAME") == "dune-codegen":
        return get_option("project_basedir")
    else:
        # Configuring a downstream module: take the dune-codegen location from
        # the CMake cache (cache entry name assumed to be "dune-codegen_DIR").
        return get_cmake_cache_entry("dune-codegen_DIR")


def compiler_invocation(name, filename):
    # Determine the CMake Generator in use
    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
    assert(gen == "Unix Makefiles")

    # Find compiler path
    compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
    compile_flags = [compiler]

    # Parse compiler flags
    for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"):
        match = re.match("([^=]*)=(.*)", line)
        if match:
            compile_flags.extend(match.groups()[1].split())

    # Add the source file
    compile_flags.append(filename)

    # Parse linker flags
    for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"):
        match = re.match(".*_autotune_target (.*)", line)
        if match:
            for flag in match.groups()[0].split():
                if flag.startswith("-") or os.path.isabs(flag):
                    compile_flags.append(flag)
                else:
                    compile_flags.append(os.path.join(get_dune_codegen_dir(), "python", flag))

    # Set an output name
    compile_flags.append("-o")
    compile_flags.append(name)

    return compile_flags
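
# Illustration only (paths and flags are made up): the returned list has the
# shape of a plain compiler command line, e.g.
#
#   ["/usr/bin/g++", "-O3", "-march=native", "autotune_foo.cc",
#    "-L/path/to/libs", "-ldunecommon", "-o", "autotune_foo"]
#
# and is later handed to subprocess.call().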


def write_global_data(sf, filename):
    opcounting = get_option("opcounter")
    with open(filename, "a") as f:
        # Get kernel
        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
            f.write("{}\n".format(next(iter(decl.generate()))))


def write_setup_code(sf, filename, define_thetas=True):
    opcounting = get_option("opcounter")
    with open(filename, "a") as f:
        # Setup a polynomial object (normally done in the LocalOperator members)
        from dune.codegen.loopy.target import type_floatingpoint
        real = type_floatingpoint()
        f.write(" using RF = {};\n".format(real))
        f.write(" using DF = {};\n".format(real))

        from dune.codegen.sumfact.tabulation import name_polynomials
        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence_quadrature_permuted)
        for deg in set(degs):
            f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))

        # Get kernel
        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        # Allocate buffers
        size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
                   product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
        f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
                      " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
                      ])

        # Setup fastdg inputs
        for arg in sf.interface.signature_args:
            if "jacobian" in arg:
                f.write("{} = 0;\n".format(arg))
            else:
                size = sf.interface.fastdg_interface_object_size
                f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))

        # Write stuff into the input buffer
        f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),
                      " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
                      " for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)),
                      " input[i] = ({})(i+1);\n".format(real),
                      ])

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        if define_thetas:
            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
                f.write(" {}\n".format(next(iter(decl.generate()))))

        for _, line in constructor_knl.preambles:
            if "gfsu" not in line:
                f.write(" {}\n".format(line))

        # Add setup code for the theta matrices. This emits a few lines that are
        # not strictly necessary, but removing them would be more work than
        # keeping them.
        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
            if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
                f.write(" {}\n".format(line))

        # Introduce a variable that makes sure the kernel cannot be optimized away
        f.writelines([" {} accum;\n".format(real),
                      " std::mt19937 rng;\n",
                      " rng.seed(42);\n",
                      " std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                      ])

    return opcounting
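
# For orientation (sketch only, sizes and declarations depend on the kernel at
# hand), the setup code emitted above looks roughly like:
#
#   using RF = double;
#   using DF = double;
#   char buffer0[SIZE] __attribute__ ((aligned (32)));
#   char buffer1[SIZE] __attribute__ ((aligned (32)));
#   double *input = (double *)buffer0;
#   double *output = (double *)buffer1;
#   for(int i=0; i<N; ++i)
#     input[i] = (double)(i+1);
#   double accum;
#   std::mt19937 rng;
#   rng.seed(42);
#   std::uniform_int_distribution<> dis(0, N);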


def generate_standalone_code_google_benchmark(sf, filename):
    delete_cache_items("kernel_default")

    # Extract sum factorization kernel
    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
    knl = realize_sumfact_kernel_function(sf)

    # Add the implementation of the kernel.
    # TODO: This can probably be done in a safer way?
    first_line = knl.member.lines[0]
    arguments = first_line[first_line.find("(") + 1:first_line.find(")")]

    with open(filename, "w") as f:
        f.writelines(["// {}".format(first_line),
                      "\n",
                      "#include \"config.h\"\n",
                      "#include \"benchmark/benchmark.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/codegen/common/vectorclass.hh>\n",
                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])

    with open(filename, "a") as f:
        f.write("void sumfact_kernel({})\n".format(arguments))
        for line in knl.member.lines[1:]:
            f.write("{}\n".format(line))
        f.write("\n\n")
        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")

    write_setup_code(sf, filename, define_thetas=False)

    with open(filename, "a") as f:
        f.writelines([" for (auto _ : state){\n",
                      " sumfact_kernel(buffer0, buffer1);\n",
                      " }\n",
                      "}\n",
                      "BENCHMARK(BM_sumfact_kernel);\n",
                      "\n",
                      "BENCHMARK_MAIN();"
                      ])
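
# The generated Google Benchmark source then has roughly this structure
# (sketch; the kernel signature is taken verbatim from the loopy kernel):
#
#   void sumfact_kernel(/* arguments from the kernel */)
#   { /* kernel body */ }
#
#   static void BM_sumfact_kernel(benchmark::State& state){
#     /* setup code written by write_setup_code */
#     for (auto _ : state){
#       sumfact_kernel(buffer0, buffer1);
#     }
#   }
#   BENCHMARK(BM_sumfact_kernel);
#   BENCHMARK_MAIN();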


def generate_standalone_code(sf, filename):
    delete_cache_items("kernel_default")

    # Extract sum factorization kernel
    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
    knl = realize_sumfact_kernel_function(sf)
    first_line = knl.member.lines[0]

    with open(filename, "w") as f:
        f.writelines(["// {}".format(first_line),
                      "\n",
                      "#include \"config.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/codegen/common/tsc.hh>\n",
                      "#include<dune/codegen/common/vectorclass.hh>\n",
                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])

        f.writelines(["int main(int argc, char** argv)\n",
                      "{\n",
                      ])

    opcounting = write_setup_code(sf, filename)

    # Write measurement
    with open(filename, "a") as f:
        # Start the TSC timer (counterpart to the TSC::stop() call below)
        f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
                      ])

        # Add the implementation of the kernel.
        repeats = int(1e9 / sf.operations)
        f.write(" for(int i=0; i<{}; ++i)\n".format(repeats))
        f.write(" {\n")
        for line in knl.member.lines[1:]:
            f.write(" {}\n".format(line))
        f.write(" }\n")

        # Stop the TSC timer and write the result to a file
        f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n",
                      " std::ofstream file;\n",
                      " file.open(argv[1]);\n",
                      " file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
                      " file.close();\n",
                      " accum += output[dis(rng)];\n",
                      " std::cout << accum;\n",
                      "}\n",
                      ])

    set_option("opcounter", opcounting)


def autotune_realization(sf):
    # Make sure that the benchmark directory exists
    dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
    if not os.path.exists(dir):
        os.mkdir(dir)

    basename = "autotune_sumfact_{}".format(sf.function_name)
    basename = hashlib.sha256(basename.encode()).hexdigest()

    filename = os.path.join(dir, "{}.cc".format(basename))
    logname = os.path.join(dir, "{}.log".format(basename))
    lock = os.path.join(dir, "{}.lock".format(basename))
    executable = os.path.join(dir, basename)

    # Generate and compile a benchmark program
    with cache_restoring():
        with filelock.FileLock(lock):
            if not os.path.isfile(logname):
                if get_option("autotune_google_benchmark"):
                    generate_standalone_code_google_benchmark(sf, filename)
                else:
                    generate_standalone_code(sf, filename)

                devnull = open(os.devnull, 'w')
                ret = subprocess.call(compiler_invocation(executable, filename), stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(executable, filename))))

                # Check whether the user specified an execution wrapper
                call = []
                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_WRAPPER")
                if wrapper:
                    call.append(wrapper)

                # Run the benchmark program
                call.append(executable)
                if get_option("autotune_google_benchmark"):
                    call.append("--benchmark_out={}".format(logname))
                    # call.append("--benchmark_out_format=csv")
                else:
                    call.append(logname)

                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))

            # Extract the result from the log file
            if get_option("autotune_google_benchmark"):
                import json
                with open(logname) as json_file:
                    data = json.load(json_file)
                    return data['benchmarks'][0]['cpu_time']
            else:
                return float(next(iter(open(logname, "r")))) / 1000000
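
# Hypothetical usage sketch (not part of this module, "candidates" is made up):
# the measured cost can be used to pick the cheapest among several candidate
# sum factorization realizations.
#
#   costs = {sf: autotune_realization(sf) for sf in candidates}
#   best = min(costs, key=costs.get)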