""" Autotuning for sum factorization kernels """ import os import re import subprocess import filelock import hashlib import json from operator import mul import time import loopy as lp from pytools import product from cgen import ArrayOf, AlignedAttribute, Initializer from dune.codegen.generation import cache_restoring, delete_cache_items from dune.codegen.loopy.target import DuneTarget, type_floatingpoint from dune.codegen.sumfact.realization import realize_sumfact_kernel_function from dune.codegen.options import get_option, set_option from dune.codegen.error import CodegenAutotuneError def get_cmake_cache_entry(entry): for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"): match = re.match("{}:[INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED|STATIC]+=(.*)".format(entry), line) if match: return match.groups()[0] def get_dune_codegen_dir(): if get_cmake_cache_entry("CMAKE_PROJECT_NAME") == "dune-codegen": return get_option("project_basedir") else: return get_cmake_cache_entry("dune-codegen_DIR") def compiler_invocation(name, filename): # Determine the CMake Generator in use gen = get_cmake_cache_entry("CMAKE_GENERATOR") assert(gen == "Unix Makefiles") # Find compiler path compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER") compile_flags = [compiler] # Parse compiler flags for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"): match = re.match("([^=]*)=(.*)", line) if match: compile_flags.extend(match.groups()[1].split()) # Add the source file compile_flags.append(filename) # Parse linker flags for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"): match = re.match(".*_autotune_target (.*)", line) if match: for flag in match.groups()[0].split(): if flag.startswith("-") or os.path.isabs(flag): compile_flags.append(flag) else: compile_flags.append(os.path.join(get_dune_codegen_dir(), "python", flag)) # Set an output name compile_flags.append("-o") compile_flags.append(name) return compile_flags def write_global_data(sf, filename): opcounting = get_option("opcounter") with open(filename, "a") as f: # Get kernel from dune.codegen.pdelab.localoperator import extract_kernel_from_cache knl = realize_sumfact_kernel_function(sf) constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False) constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False)) constructor_knl = lp.get_one_scheduled_kernel(constructor_knl) target = DuneTarget() from loopy.codegen import CodeGenerationState codegen_state = CodeGenerationState(kernel=constructor_knl, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), seen_dtypes=frozenset(), seen_functions=frozenset(), seen_atomic_dtypes=frozenset(), var_subst_map={}, allow_complex=False, is_generating_device_code=True, ) for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0): f.write("{}\n".format(next(iter(decl.generate())))) def write_setup_code(sf, filename, define_thetas=True): with open(filename, "a") as f: # Setup a polynomial object (normally done in the LocalOperator members) from dune.codegen.loopy.target import type_floatingpoint real = type_floatingpoint() f.write(" using RF = {};\n".format(real)) f.write(" using DF = {};\n".format(real)) from dune.codegen.sumfact.tabulation import name_polynomials degs = tuple(m.basis_size - 1 for m in 


def write_global_data(sf, filename):
    opcounting = get_option("opcounter")
    with open(filename, "a") as f:
        # Get kernel
        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
            f.write("{}\n".format(next(iter(decl.generate()))))


def write_setup_code(sf, filename, define_thetas=True):
    with open(filename, "a") as f:
        # Setup a polynomial object (normally done in the LocalOperator members)
        from dune.codegen.loopy.target import type_floatingpoint
        real = type_floatingpoint()
        f.write(" using RF = {};\n".format(real))
        f.write(" using DF = {};\n".format(real))

        from dune.codegen.sumfact.tabulation import name_polynomials
        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence_quadrature_permuted)
        for deg in set(degs):
            f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))

        # Get kernel
        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        # Allocate buffers
        alignment = get_option("max_vector_width") // 8
        size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
                   product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
        size = int(size * (get_option("precision_bits") / 8))
        f.writelines([" char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
                      " char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
                      ])

        # Setup fastdg inputs
        for arg in sf.interface.signature_args:
            if "jacobian" in arg:
                f.write("{} = 0;\n".format(arg))
            else:
                size = sf.interface.fastdg_interface_object_size
                f.write(" RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment))

        # Write stuff into the input buffer
        f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),
                      " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
                      " for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)),
                      " input[i] = ({})(i+1);\n".format(real),
                      ])

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        if define_thetas:
            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
                f.write(" {}\n".format(next(iter(decl.generate()))))

        for _, line in constructor_knl.preambles:
            if "gfsu" not in line:
                f.write(" {}\n".format(line))

        # Add setup code for theta matrices. We add some unnecessary lines,
        # but removing them would be more work than keeping them.
        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
            if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
                f.write(" {}\n".format(line))

        # Introduce a variable that makes sure that the kernel cannot be optimized away
        f.writelines([" {} accum;\n".format(real),
                      " std::mt19937 rng;\n",
                      " rng.seed(42);\n",
                      " std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                      ])


def generate_standalone_code_google_benchmark(sf, filename):
    delete_cache_items("kernel_default")

    # Turn off opcounting
    opcounting = get_option("opcounter")
    set_option("opcounter", False)

    # Extract sum factorization kernel
    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
    knl = realize_sumfact_kernel_function(sf)

    # Add the implementation of the kernel.
    # TODO: This can probably be done in a safer way?
    first_line = knl.member.lines[0]
    arguments = first_line[first_line.find("(") + 1:first_line.find(")")]

    with open(filename, "w") as f:
        f.writelines(["// {}".format(first_line),
                      "\n",
                      "#include \"config.h\"\n",
                      "#include \"benchmark/benchmark.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/codegen/common/vectorclass.hh>\n",
                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])

    write_global_data(sf, filename)

    with open(filename, "a") as f:
        arguments = ', '.join(sf.interface.signature_args)
        if len(arguments) > 0:
            arguments = ', ' + arguments
        arguments = 'const char* buffer0, const char* buffer1' + arguments
        f.write("void sumfact_kernel({})\n".format(arguments))
        for line in knl.member.lines[1:]:
            f.write("{}\n".format(line))

        f.write("\n\n")
        f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")

    write_setup_code(sf, filename, define_thetas=False)

    additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
    additional_arguments = ', '.join(additional_arguments)
    if len(additional_arguments) > 0:
        additional_arguments = ', ' + additional_arguments
    with open(filename, "a") as f:
        f.writelines([" for (auto _ : state){\n",
                      " sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments),
                      " }\n",
                      "}\n",
                      "BENCHMARK(BM_sumfact_kernel);\n",
                      "\n",
                      "BENCHMARK_MAIN();"
                      ])

    # Restore the opcounter option if it was enabled before
    set_option("opcounter", opcounting)
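

# Layout of the benchmark file emitted above (descriptive summary): after the
# include block and the global theta data from write_global_data(), it defines
# a free function sumfact_kernel(const char* buffer0, const char* buffer1, ...)
# holding the generated kernel body, a BM_sumfact_kernel(benchmark::State&)
# wrapper that sets up buffers via write_setup_code() and calls the kernel once
# per benchmark iteration, and finally the BENCHMARK/BENCHMARK_MAIN boilerplate.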


def generate_standalone_code(sf, filename):
    delete_cache_items("kernel_default")

    # Turn off opcounting
    opcounting = get_option("opcounter")
    set_option("opcounter", False)

    # Extract sum factorization kernel
    from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
    knl = realize_sumfact_kernel_function(sf)
    first_line = knl.member.lines[0]

    with open(filename, "w") as f:
        f.writelines(["// {}".format(first_line),
                      "\n",
                      "#include \"config.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/codegen/common/tsc.hh>\n",
                      "#include<dune/codegen/common/vectorclass.hh>\n",
                      "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                      "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])
        f.writelines(["int main(int argc, char** argv)\n",
                      "{\n",
                      ])

    write_setup_code(sf, filename)

    # Write measurement
    with open(filename, "a") as f:
        # Start a TSC timer
        f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
                      ])

        # Add the implementation of the kernel. The repeat count is chosen so
        # that roughly 1e9 operations are performed in total.
        repeats = int(1e9 / sf.operations)
        f.write(" for(int i=0; i<{}; ++i)\n".format(repeats))
        f.write(" {\n")
        for line in knl.member.lines[1:]:
            f.write(" {}\n".format(line))
        f.write(" }\n")

        # Stop the TSC timer and write the result to a file
        f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n",
                      " std::ofstream file;\n",
                      " file.open(argv[1]);\n",
                      " file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
                      " file.close();\n",
                      " accum += output[dis(rng)];\n",
                      " std::cout << accum;\n",
                      "}\n",
                      ])

    # Restore the opcounter option if it was enabled before
    set_option("opcounter", opcounting)
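

# Descriptive note on the emitted standalone program: it repeats the kernel
# int(1e9 / sf.operations) times between TSC::start()/TSC::stop(), writes the
# elapsed TSC ticks per kernel call to the file given as argv[1], and prints
# the accum variable so the compiler cannot optimize the benchmark loop away.
# autotune_realization() below reads that single number back and scales it by
# 1e-6 to obtain its score.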


def autotune_realization(sf=None, kernel=None, signature=None):
    # A benchmark is generated either from a SumfactKernel object (sf) or,
    # if sf is None, from a plain loopy kernel/signature pair.
    # Make sure that the benchmark directory exists
    dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
    if not os.path.exists(dir):
        os.mkdir(dir)

    if sf is None:
        basename = "autotune_sumfact_{}".format(kernel.name)
    else:
        basename = "autotune_sumfact_{}".format(sf.function_name)
    basename = hashlib.sha256(basename.encode()).hexdigest()

    filename = os.path.join(dir, "{}.cc".format(basename))
    logname = os.path.join(dir, "{}.log".format(basename))
    lock = os.path.join(dir, "{}.lock".format(basename))
    executable = os.path.join(dir, basename)

    # Generate and compile a benchmark program
    #
    # Note: cache restoring is only necessary when generating from SumfactKernel
    with cache_restoring():
        with filelock.FileLock(lock):
            if not os.path.isfile(logname):
                if sf is None:
                    generate_standalone_kernel_code(kernel, signature, filename)
                elif get_option("autotune_google_benchmark"):
                    generate_standalone_code_google_benchmark(sf, filename)
                else:
                    generate_standalone_code(sf, filename)

                call = []
                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_COMPILATION_WRAPPER")
                if wrapper:
                    call.append(wrapper)
                call.extend(compiler_invocation(executable, filename))
                devnull = open(os.devnull, 'w')
                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))

                # File system synchronization!
                while not os.path.exists(executable):
                    time.sleep(0.01)

                # Check whether the user specified an execution wrapper
                call = []
                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER")
                if wrapper:
                    call.append(wrapper)

                # Run the benchmark program
                call.append(executable)
                if get_option("autotune_google_benchmark"):
                    call.append("--benchmark_out={}".format(logname))
                    # call.append("--benchmark_out_format=csv")
                else:
                    call.append(logname)
                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))

                # File system synchronization!
                while not os.path.exists(logname):
                    time.sleep(0.01)

            # Extract the result from the log file
            if get_option("autotune_google_benchmark"):
                import json
                with open(logname) as json_file:
                    try:
                        data = json.load(json_file)
                        return data['benchmarks'][0]['cpu_time']
                    except Exception as e:
                        print("Error while loading file {}".format(logname))
                        raise e
            else:
                return float(next(iter(open(logname, "r")))) / 1000000
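

# Hedged usage sketch (illustrative only, not part of the generator): the value
# returned by autotune_realization() -- Google Benchmark cpu_time or scaled TSC
# ticks per call -- is only meaningful relative to other candidates, so a caller
# typically benchmarks several variants of the same sum factorization kernel and
# keeps the cheapest one, e.g.
#
#     best = min(candidate_kernels, key=autotune_realization)
#
# where ``candidate_kernels`` is a hypothetical iterable of SumfactKernel
# objects describing different vectorization strategies.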