diff --git a/bin/donkey_benchmark_compilation_wrapper.sh b/bin/donkey_benchmark_compilation_wrapper.sh index e2691b5d71db2473b731fabcea6a1095ec2516d9..a786d111264ef0e67769100713f1776e3e002c3d 100755 --- a/bin/donkey_benchmark_compilation_wrapper.sh +++ b/bin/donkey_benchmark_compilation_wrapper.sh @@ -13,7 +13,4 @@ ml parmetis ("$@") code=$? -echo "Code: $code" -sleep 0.1s - exit $code diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py index 7951b8b06bfdeb217bd3328c9b3b88f229bbc606..d383963318291ae947c70b1f083c320a60250303 100755 --- a/bin/donkey_benchmark_execution_wrapper.py +++ b/bin/donkey_benchmark_execution_wrapper.py @@ -13,8 +13,3 @@ ret = subprocess.call(command) # If that failed - fail! if ret != 0: sys.exit(ret) - -# If that was succesful, wait for the output file to be available on the filesystem -# This step is necessary because the NFS synchronization is too slow for our workflow. -while not os.path.isfile(sys.argv[2]): - time.sleep(0.1) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b96bd06cbce6361f1130e17bb7cb4d97efe08ec7..8881504f934e7b1d755580ffc46d47f2a1467346 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,5 +25,5 @@ add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc) target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing) if(benchmark_FOUND) - target_link_libraries(_autotune_target benchmark::benchmark) + target_link_libraries(_autotune_target benchmark) endif() diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index d19fd7be5587258e8db1f3ada3301b1e7017f95e..a1d7442addc4e2d1d720b1cc81d8a914c1f75532 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -14,6 +14,7 @@ import re import subprocess import filelock import hashlib +import time def get_cmake_cache_entry(entry): @@ -94,10 +95,8 @@ def write_global_data(sf, filename): def write_setup_code(sf, filename, define_thetas=True): - opcounting = get_option("opcounter") with open(filename, "a") as f: # Setup a polynomial object (normally done in the LocalOperator members) - set_option("opcounter", False) from dune.codegen.loopy.target import type_floatingpoint real = type_floatingpoint() f.write(" using RF = {};\n".format(real)) @@ -116,11 +115,12 @@ def write_setup_code(sf, filename, define_thetas=True): constructor_knl = lp.get_one_scheduled_kernel(constructor_knl) # Allocate buffers + alignment = get_option("max_vector_width") // 8 size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width, product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width) size = int(size * (get_option("precision_bits") / 8)) - f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size), - " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size), + f.writelines([" char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment), + " char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment), ]) # Setup fastdg inputs @@ -129,7 +129,7 @@ def write_setup_code(sf, filename, define_thetas=True): f.write("{} = 0;\n".format(arg)) else: size = sf.interface.fastdg_interface_object_size - f.write(" RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size)) + f.write(" RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment)) # Write stuff into the input buffer f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real), @@ -172,12 +172,15 @@ def write_setup_code(sf, filename, define_thetas=True): " rng.seed(42);\n", " std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)), ]) - return opcounting def generate_standalone_code_google_benchmark(sf, filename): delete_cache_items("kernel_default") + # Turn off opcounting + opcounting = get_option("opcounter") + set_option("opcounter", False) + # Extract sum factorization kernel from dune.codegen.pdelab.localoperator import extract_kernel_from_cache knl = realize_sumfact_kernel_function(sf) @@ -205,6 +208,10 @@ def generate_standalone_code_google_benchmark(sf, filename): write_global_data(sf, filename) with open(filename, "a") as f: + arguments = ', '.join(sf.interface.signature_args) + if len(arguments) > 0: + arguments = ', ' + arguments + arguments = 'const char* buffer0, const char* buffer1' + arguments f.write("void sumfact_kernel({})\n".format(arguments)) for line in knl.member.lines[1:]: f.write("{}\n".format(line)) @@ -212,7 +219,7 @@ def generate_standalone_code_google_benchmark(sf, filename): f.write("\n\n") f.write("static void BM_sumfact_kernel(benchmark::State& state){\n") - opcounting = write_setup_code(sf, filename, define_thetas=False) + write_setup_code(sf, filename, define_thetas=False) additional_arguments = [i.split()[-1] for i in sf.interface.signature_args] additional_arguments = ', '.join(additional_arguments) @@ -227,12 +234,18 @@ def generate_standalone_code_google_benchmark(sf, filename): "\n", "BENCHMARK_MAIN();" ]) + + # Maybe turn opcounting on again set_option("opcounter", opcounting) def generate_standalone_code(sf, filename): delete_cache_items("kernel_default") + # Turn off opcounting + opcounting = get_option("opcounter") + set_option("opcounter", False) + # Extract sum factorization kernel from dune.codegen.pdelab.localoperator import extract_kernel_from_cache knl = realize_sumfact_kernel_function(sf) @@ -257,7 +270,7 @@ def generate_standalone_code(sf, filename): "{\n", ]) - opcounting = write_setup_code(sf, filename) + write_setup_code(sf, filename) # Write measurement with open(filename, "a") as f: @@ -283,7 +296,9 @@ def generate_standalone_code(sf, filename): " std::cout << accum;\n", "}\n", ]) - set_option("opcounter", opcounting) + + # Maybe turn opcounting on again + set_option("opcounter", opcounting) def autotune_realization(sf): @@ -320,6 +335,10 @@ def autotune_realization(sf): if ret != 0: raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call))) + # File system synchronization! + while not os.path.exists(executable): + time.sleep(0.01) + # Check whether the user specified an execution wrapper call = [] wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER") @@ -337,11 +356,19 @@ def autotune_realization(sf): if ret != 0: raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call))) + # File system synchronization! + while not os.path.exists(logname): + time.sleep(0.01) + # Extract the result form the log file if get_option("autotune_google_benchmark"): import json with open(logname) as json_file: - data = json.load(json_file) - return data['benchmarks'][0]['cpu_time'] + try: + data = json.load(json_file) + return data['benchmarks'][0]['cpu_time'] + except Exception as e: + print("Error while loading file {}".format(logname)) + raise e else: return float(next(iter(open(logname, "r")))) / 1000000