diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cf8469e5b7b3ad58a370e1c114229740813b812..d26e0ab5fa633638a039ad51fc61f9545aab4e26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,5 +40,7 @@ add_subdirectory(test) add_subdirectory(bin) add_subdirectory(applications) +add_executable(autotune_target EXCLUDE_FROM_ALL autotune.cc) + # finalize the dune project, e.g. generating config.h etc. finalize_dune_project(GENERATE_CONFIG_H_CMAKE) diff --git a/python/_autotune.cc b/python/_autotune.cc new file mode 100644 index 0000000000000000000000000000000000000000..0c87b37802d394fb8dafc0ca6ae150797e81199a --- /dev/null +++ b/python/_autotune.cc @@ -0,0 +1,2 @@ +int main() +{} diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 9d4ccb3f5da38ac1d83bd5d7b63f7c7a1059c99d..2bbc2474063943f198e778123ecd23cc1d4abde7 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -82,7 +82,7 @@ class PerftoolFormOptionsArray(ImmutableRecord): sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization") sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)") vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target") + vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune") vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize") vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy") vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy") diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 3fb213c87e98883e4f474c4145090b40aff1eec4..1b49197e8e48af700e3697ef3a90b338cf7fa200 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -459,7 +459,7 @@ def generate_kernel(integrals): # Delete the cache contents and do the real thing! logger.debug('generate_kernel: visit_integrals (no dry run)') from dune.perftool.generation import delete_cache_items - delete_cache_items("kernel_default") + delete_cache_items("kernel_default or member") for integral in integrals: visit_integral(integral) diff --git a/python/dune/perftool/sumfact/autotune.py b/python/dune/perftool/sumfact/autotune.py new file mode 100644 index 0000000000000000000000000000000000000000..8d171ecf8921eb7c1dc8ee21a1df310baafedfb6 --- /dev/null +++ b/python/dune/perftool/sumfact/autotune.py @@ -0,0 +1,182 @@ +""" Autotuning for sum factorization kernels """ + +from dune.perftool.generation import delete_cache_items +from dune.perftool.loopy.target import DuneTarget +from dune.perftool.sumfact.realization import realize_sumfact_kernel_function +from dune.perftool.options import get_option + +import loopy as lp +from pytools import product + +import os +import re +import subprocess + + +def get_cmake_cache_entry(entry): + for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"): + match = re.match("{}:[INTERNAL|FILEPATH|BOOL|STRING|PATH]+=(.*)".format(entry), line) + if match: + return match.groups()[0] + + +def compiler_invocation(name, filename): + # Determine the CMake Generator in use + gen = get_cmake_cache_entry("CMAKE_GENERATOR") + assert(gen == "Unix Makefiles") + + # Find compiler path + compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER") + compile_flags = [compiler] + + # Parse compiler flags + for line in open(os.path.join(get_option("project_basedir"), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"): + match = re.match("([^=]*)=(.*)", line) + if match: + compile_flags.extend(match.groups()[1].split()) + + # Add the source file + compile_flags.append(filename) + + # Parse linker flags + for line in open(os.path.join(get_option("project_basedir"), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"): + match = re.match(".*_autotune_target (.*)", line) + if match: + for flag in match.groups()[0].split(): + if flag.startswith("-") or os.path.isabs(flag): + compile_flags.append(flag) + else: + compile_flags.append(os.path.join(get_option("project_basedir"), "python", flag)) + + # Set an output name + compile_flags.append("-o") + compile_flags.append(name) + + return compile_flags + + +def generate_standalone_code(sf, filename, logname): + delete_cache_items("kernel_default") + + with open(filename, "w") as f: + f.writelines(["#include \"config.h\"\n", + "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", + "#include<dune/perftool/common/tsc.hh>\n", + "#include<dune/perftool/common/vectorclass.hh>\n", + "#include<dune/perftool/sumfact/onedquadrature.hh>\n", + "#include<random>\n", + "#include<fstream>\n", + "\n" + ]) + + f.writelines(["int main(int argc, char** argv)\n", + "{\n", + ]) + + # Setup a polynomial object (normally done in the LocalOperator members) + from dune.perftool.loopy.target import type_floatingpoint + real = type_floatingpoint() + f.write(" using RF = {};\n".format(real)) + f.write(" using DF = {};\n".format(real)) + + from dune.perftool.sumfact.tabulation import name_polynomials + degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence) + for deg in set(degs): + f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg))) + + # Get kernels + from dune.perftool.pdelab.localoperator import extract_kernel_from_cache + knl = realize_sumfact_kernel_function(sf) + constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False) + constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False)) + constructor_knl = lp.get_one_scheduled_kernel(constructor_knl) + + # Allocate buffers + size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width, + product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width) + f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size), + " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size), + ]) + + # Write stuff into the input buffer + f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real), + " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2), + " for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)), + " input[i] = ({})(i+1);\n".format(real), + ]) + + target = DuneTarget() + from loopy.codegen import CodeGenerationState + codegen_state = CodeGenerationState(kernel=constructor_knl, + implemented_data_info=None, + implemented_domain=None, + implemented_predicates=frozenset(), + seen_dtypes=frozenset(), + seen_functions=frozenset(), + seen_atomic_dtypes=frozenset(), + var_subst_map={}, + allow_complex=False, + is_generating_device_code=True, + ) + + for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0): + f.write(" {}\n".format(next(iter(decl.generate())))) + + for _, line in constructor_knl.preambles: + if "gfsu" not in line: + f.write(" {}\n".format(line)) + + # Add setup code for theta matrices. We add some lines not necessary, + # but it would be more work to remove them than keeping them. + for line in lp.generate_body(constructor_knl).split("\n")[1:-1]: + if "gfsu" not in line and "meshwidth" not in line: + f.write(" {}\n".format(line)) + + # INtroduces a variable that makes sure that the kernel cannot be optimized away + f.writelines([" {} accum;\n".format(real), + " std::mt19937 rng;\n", + " rng.seed(42);\n", + " std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)), + ]) + + # Start a TSC timer + f.writelines([ + " auto start = Dune::PDELab::TSC::start();\n", + ]) + + # Add the implementation of the kernel. + f.write(" for(int i=0; i<10000000; ++i)\n") + f.write(" {\n") + for line in knl.member.lines[1:]: + f.write(" {}\n".format(line)) + f.write(" }\n") + + # Stop the TSC timer and write the result to a file + f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n", + " std::ofstream file;\n", + " file.open(\"{}\");\n".format(logname), + " file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n", + " file.close();\n", + " accum += output[dis(rng)];\n", + " std::cout << accum;\n", + "}\n", + ]) + + +def autotune_realization(sf): + name = "autotune_sumfact_{}".format(sf.function_name) + filename = "{}.cc".format(name) + logname = "{}.log".format(name) + + # Generate and compile a benchmark program + generate_standalone_code(sf, filename, logname) + ret = subprocess.call(compiler_invocation(name, filename)) + assert ret == 0 + + # Run the benchmark program + devnull = open(os.devnull, 'w') + ret = subprocess.call(["./{}".format(name)], stdout=devnull, stderr=subprocess.STDOUT) + assert ret == 0 + + # Extract the result form the log file + return float(next(iter(open(logname, "r")))) / 1000000 diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index de13c45c3105e8c0e7411d1d849f7f55f4629e96..74a6a3d1e400210395f11314397f39acf0acc4b6 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -362,13 +362,17 @@ def name_polynomials(degree): return name -@preamble(kernel="operator") def sort_quadrature_points_weights(qp, qw, bound): range_field = lop_template_range_field() domain_field = name_domain_field() include_file("dune/perftool/sumfact/onedquadrature.hh", filetag="operatorfile") - return "onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \ - .format(range_field, domain_field, bound, qp, qw) + return frozenset({instruction(code="onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \ + .format(range_field, domain_field, bound, qp, qw), + assignees=frozenset({qp, qw}), + read_variables=frozenset({qp, qw}), + kernel="operator", + ), + }) @iname(kernel="operator") @@ -439,10 +443,11 @@ def define_theta(name, tabmat, additional_indices=(), width=None): args = [inames[1]] + dep = frozenset() if tabmat.face is None: qp = name_oned_quadrature_points(bound) qw = name_oned_quadrature_weights(bound) - sort_quadrature_points_weights(qp, qw, bound) + dep = sort_quadrature_points_weights(qp, qw, bound) args.append(prim.Subscript(prim.Variable(qp), (inames[0],))) else: args.append(tabmat.face) @@ -450,6 +455,7 @@ def define_theta(name, tabmat, additional_indices=(), width=None): instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices), expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)), kernel="operator", + depends_on=dep, ) diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index aed3f4a30b9115fc0a4aef1169f85db8e492853a..58df262f7158fa9bd8d855bfe665b10172bc1f04 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -110,6 +110,9 @@ def strategy_cost(strat_tuple): func = explicit_costfunction elif s == "target": func = target_costfunction + elif s == "autotune": + from dune.perftool.sumfact.autotune import autotune_realization + func = autotune_realization else: raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s))