""" Autotuning for sum factorization kernels """

from dune.codegen.generation import cache_restoring, delete_cache_items
from dune.codegen.loopy.target import DuneTarget
from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
from dune.codegen.options import get_option, set_option
from dune.codegen.error import CodegenAutotuneError

import loopy as lp
from pytools import product

import os
import re
import subprocess
import filelock


def get_cmake_cache_entry(entry):
    """Look up a variable in the CMakeCache.txt of the project.

    The cache file is read from the directory given by the
    "project_basedir" option.

    :param entry: Name of the CMake cache variable.
    :returns: The value of the first line of the form ``entry:TYPE=value``
        as a string, or None if no such line exists.
    """
    # The entry name is escaped so that characters like '-' or '.' in the
    # variable name are matched literally. The type is matched as one of the
    # listed alternatives.
    pattern = re.compile(
        r"{}:(?:INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED|STATIC)=(.*)".format(re.escape(entry))
    )
    cache_file = os.path.join(get_option("project_basedir"), "CMakeCache.txt")
    with open(cache_file, "r") as cache:
        for line in cache:
            match = pattern.match(line)
            if match:
                return match.group(1)
    return None


def get_dune_codegen_dir():
    """Return the directory used to locate the dune-codegen build files.

    If the CMake project being built is dune-codegen itself, this is the
    "project_basedir" option, otherwise the cache entry "dune-codegen_DIR".
    """
    if get_cmake_cache_entry("CMAKE_PROJECT_NAME") == "dune-codegen":
        return get_option("project_basedir")
    else:
        return get_cmake_cache_entry("dune-codegen_DIR")


def compiler_invocation(name, filename):
    """Assemble the command line that compiles a standalone benchmark.

    Compiler and flags are taken from the files CMake generated for the
    target ``_autotune_target`` (flags.make and link.txt).

    :param name: Path of the executable to produce (passed to ``-o``).
    :param filename: Path of the source file to compile.
    :returns: The invocation as a list of strings suitable for subprocess.
    :raises CodegenAutotuneError: If the CMake generator is not
        "Unix Makefiles" (the only one whose files are parsed here).
    """
    # Determine the CMake Generator in use
    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
    if gen != "Unix Makefiles":
        raise CodegenAutotuneError(
            "Autotuning requires the CMake generator 'Unix Makefiles', found: {}".format(gen)
        )

    # Find compiler path
    compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
    compile_flags = [compiler]

    # Resolve the directories once instead of re-parsing the CMake cache
    # for every single flag.
    python_dir = os.path.join(get_dune_codegen_dir(), "python")
    target_dir = os.path.join(python_dir, "CMakeFiles", "_autotune_target.dir")

    # Parse compiler flags: the right hand side of every assignment counts
    with open(os.path.join(target_dir, "flags.make"), "r") as flagsfile:
        for line in flagsfile:
            match = re.match("([^=]*)=(.*)", line)
            if match:
                compile_flags.extend(match.group(2).split())

    # Add the source file
    compile_flags.append(filename)

    # Parse linker flags: everything after the target name on the link line
    with open(os.path.join(target_dir, "link.txt"), "r") as linkfile:
        for line in linkfile:
            match = re.match(".*_autotune_target (.*)", line)
            if match:
                for flag in match.group(1).split():
                    if flag.startswith("-") or os.path.isabs(flag):
                        compile_flags.append(flag)
                    else:
                        # Anything else is treated as a path relative to
                        # the python directory
                        compile_flags.append(os.path.join(python_dir, flag))

    # Set an output name
    compile_flags.append("-o")
    compile_flags.append(name)

    return compile_flags


def generate_standalone_code(sf, filename):
    """Write a standalone C++ benchmark program for a sum factorization kernel.

    The program runs the kernel in a loop between TSC timer calls and writes
    the elapsed value into the file given as its first command line argument.

    The "opcounter" option is switched off during generation and restored
    afterwards, also if generation fails.

    :param sf: The sum factorization kernel object to benchmark.
    :param filename: Path of the C++ source file to write.
    """
    delete_cache_items("kernel_default")

    with open(filename, "w") as f:
        f.writelines(["#include \"config.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/codegen/common/tsc.hh>\n",
                      "#include<dune/codegen/common/vectorclass.hh>\n",
                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])

        f.writelines(["int main(int argc, char** argv)\n",
                      "{\n",
                      ])

        # Setup a polynomial object (normally done in the LocalOperator members)
        opcounting = get_option("opcounter")
        set_option("opcounter", False)
        try:
            from dune.codegen.loopy.target import type_floatingpoint
            real = type_floatingpoint()
            f.write(" using RF = {};\n".format(real))
            f.write(" using DF = {};\n".format(real))

            from dune.codegen.sumfact.tabulation import name_polynomials
            degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence)
            for deg in set(degs):
                f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))

            # Get kernels
            from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
            knl = realize_sumfact_kernel_function(sf)
            constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
            constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
            constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

            # Allocate buffers: 'size' is the buffer size in bytes,
            # 'num_entries' the number of floating point values that fit.
            bytes_per_entry = get_option("precision_bits") / 8
            size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
                       product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
            size = int(size * bytes_per_entry)
            num_entries = int(size / bytes_per_entry)
            f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
                          " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
                          ])

            # Setup fastdg inputs. The fastdg size is kept in its own
            # variable so it cannot clobber the buffer size used below.
            for arg in sf.interface.signature_args:
                if "jacobian" in arg:
                    f.write("{} = 0;\n".format(arg))
                else:
                    fastdg_size = sf.interface.fastdg_interface_object_size
                    f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], fastdg_size))

            # Write stuff into the input buffer. The loop bound is emitted
            # as an integer and never exceeds the entries of buffer0.
            f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),
                          " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
                          " for(int i=0; i<{}; ++i)\n".format(num_entries),
                          " input[i] = ({})(i+1);\n".format(real),
                          ])

            target = DuneTarget()
            from loopy.codegen import CodeGenerationState
            codegen_state = CodeGenerationState(kernel=constructor_knl,
                                                implemented_data_info=None,
                                                implemented_domain=None,
                                                implemented_predicates=frozenset(),
                                                seen_dtypes=frozenset(),
                                                seen_functions=frozenset(),
                                                seen_atomic_dtypes=frozenset(),
                                                var_subst_map={},
                                                allow_complex=False,
                                                is_generating_device_code=True,
                                                )

            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
                f.write(" {}\n".format(next(iter(decl.generate()))))

            for _, line in constructor_knl.preambles:
                if "gfsu" not in line:
                    f.write(" {}\n".format(line))

            # Add setup code for theta matrices. We add some lines not necessary,
            # but it would be more work to remove them than keeping them.
            for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
                if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
                    f.write(" {}\n".format(line))

            # Introduces a variable that makes sure that the kernel cannot be
            # optimized away. std::uniform_int_distribution draws from a closed
            # interval, so the upper bound has to be the last valid index.
            f.writelines([" {} accum;\n".format(real),
                          " std::mt19937 rng;\n",
                          " rng.seed(42);\n",
                          " std::uniform_int_distribution<> dis(0, {});\n".format(max(num_entries - 1, 0)),
                          ])

            # Start a TSC timer
            f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
                          ])

            # Add the implementation of the kernel.
            f.write(" for(int i=0; i<{}; ++i)\n".format(int(1e9 / sf.operations)))
            f.write(" {\n")
            for line in knl.member.lines[1:]:
                f.write(" {}\n".format(line))
            f.write(" }\n")

            # Stop the TSC timer and write the result to a file
            f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n",
                          " std::ofstream file;\n",
                          " file.open(argv[1]);\n",
                          " file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
                          " file.close();\n",
                          " accum += output[dis(rng)];\n",
                          " std::cout << accum;\n",
                          "}\n",
                          ])
        finally:
            # Restore the user's setting even if code generation failed
            set_option("opcounter", opcounting)


def autotune_realization(sf):
    """Benchmark a sum factorization kernel realization.

    Generates, compiles and runs a standalone benchmark program in the
    directory "autotune-benchmarks" below "project_basedir". If a log file
    for this kernel already exists, it is reused and nothing is rebuilt.
    A file lock keyed on the kernel's function name serializes this work.

    :param sf: The sum factorization kernel object to benchmark.
    :returns: The first line of the benchmark log as float, divided by 1e6.
    :raises CodegenAutotuneError: If compiling or running the benchmark
        returns a nonzero exit code.
    """
    # Make sure that the benchmark directory exists. exist_ok avoids a race
    # when several processes try to create the directory at the same time.
    benchmark_dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
    os.makedirs(benchmark_dir, exist_ok=True)

    basename = "autotune_sumfact_{}".format(sf.function_name)
    name = os.path.join(benchmark_dir, basename)

    filename = os.path.join(benchmark_dir, "{}.cc".format(basename))
    logname = os.path.join(benchmark_dir, "{}.log".format(basename))
    lock = "{}.lock".format(name)

    # Generate and compile a benchmark program
    with cache_restoring():
        with filelock.FileLock(lock):
            if not os.path.isfile(logname):
                generate_standalone_code(sf, filename)

                invocation = compiler_invocation(name, filename)
                ret = subprocess.call(invocation)
                if ret != 0:
                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(invocation)))

                # Check whether the user specified an execution wrapper
                call = []
                wrapper = get_cmake_cache_entry("DUNE_PERFTOOL_BENCHMARK_WRAPPER")
                if wrapper:
                    call.append(wrapper)

                # Run the benchmark program, discarding its console output
                call.append(name)
                call.append(logname)
                with open(os.devnull, 'w') as devnull:
                    ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))

            # Extract the result from the log file
            with open(logname, "r") as log:
                return float(next(iter(log))) / 1000000