""" Autotuning for sum factorization kernels """
import os
import re
import subprocess
import hashlib
import logging
import textwrap
import time
import pkg_resources
import filelock
from operator import mul
from six.moves import reduce
import loopy as lp
from pytools import product
from cgen import ArrayOf, AlignedAttribute, Initializer
from dune.codegen.generation import cache_restoring, delete_cache_items
from dune.codegen.loopy.target import DuneTarget, type_floatingpoint
from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
from dune.codegen.options import get_option, option_context
from dune.codegen.error import CodegenAutotuneError

logger = logging.getLogger(__name__)
def get_cmake_cache_entry(entry):
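    """Return the value of the given entry from the project's CMakeCache.txt.

    Cache entries have the form 'NAME:TYPE=VALUE', e.g.
    'CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++'. Returns None if the entry
    is not present.
    """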
for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
match = re.match("{}:[INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED|STATIC]+=(.*)".format(entry), line)
if match:
return match.groups()[0]
def get_dune_codegen_dir():
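    """Return the base directory of the dune-codegen module."""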
if get_cmake_cache_entry("CMAKE_PROJECT_NAME") == "dune-codegen":
return get_option("project_basedir")
    else:
        # Not building dune-codegen itself: look the module directory up in the CMake cache
        return get_cmake_cache_entry("dune-codegen_DIR")
def compiler_invocation(name, filename):
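    """Assemble the compiler invocation that builds the benchmark source
    'filename' into an executable 'name'.

    Compile flags are parsed from the flags.make of the _autotune_target,
    linker flags from the corresponding link.txt, so the benchmark is built
    with the same toolchain and flags as the surrounding project. The result
    is a list of the shape [compiler, <compile flags>, filename,
    <linker flags>, '-o', name] suitable for subprocess.
    """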
# Determine the CMake Generator in use
gen = get_cmake_cache_entry("CMAKE_GENERATOR")
assert(gen == "Unix Makefiles")
# Find compiler path
compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
compile_flags = [compiler]
# Parse compiler flags
for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"):
match = re.match("([^=]*)=(.*)", line)
if match:
compile_flags.extend(match.groups()[1].split())
# Add the source file
compile_flags.append(filename)
# Parse linker flags
for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"):
match = re.match(".*_autotune_target (.*)", line)
if match:
for flag in match.groups()[0].split():
if flag.startswith("-") or os.path.isabs(flag):
compile_flags.append(flag)
else:
compile_flags.append(os.path.join(get_dune_codegen_dir(), "python", flag))
# Set an output name
compile_flags.append("-o")
compile_flags.append(name)
return compile_flags
def write_global_data(sf, filename):
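    """Append the global data declarations of the constructor kernel
    (e.g. the tabulated theta matrices) to the benchmark source file."""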
opcounting = get_option("opcounter")
with open(filename, "a") as f:
# Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
target = DuneTarget()
from loopy.codegen import CodeGenerationState
codegen_state = CodeGenerationState(kernel=constructor_knl,
implemented_data_info=None,
implemented_domain=None,
implemented_predicates=frozenset(),
seen_dtypes=frozenset(),
seen_functions=frozenset(),
seen_atomic_dtypes=frozenset(),
var_subst_map={},
allow_complex=False,
is_generating_device_code=True,
)
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write("{}\n".format(next(iter(decl.generate()))))
def write_setup_code(sf, filename, define_thetas=True):
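    """Append setup code for the benchmark to the source file: polynomial
    objects, aligned scratch buffers, fastdg input arrays, the theta matrix
    setup taken from the operator's constructor kernel and an accumulation
    variable that keeps the kernel from being optimized away."""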
with open(filename, "a") as f:
# Setup a polynomial object (normally done in the LocalOperator members)
from dune.codegen.loopy.target import type_floatingpoint
real = type_floatingpoint()
f.write(" using RF = {};\n".format(real))
f.write(" using DF = {};\n".format(real))
from dune.codegen.sumfact.tabulation import name_polynomials
degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence_quadrature_permuted)
for deg in set(degs):
f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
# Get kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
# Allocate buffers
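        # max_vector_width is given in bits; buffers are aligned to the full
        # vector width in bytes.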
alignment = get_option("max_vector_width") // 8
size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
f.writelines([" char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
" char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
# Setup fastdg inputs
for arg in sf.interface.signature_args:
if "jacobian" in arg:
f.write("{} = 0;\n".format(arg))
else:
f.write(" RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment))
# Write stuff into the input buffer
f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),
" {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
" for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)),
" input[i] = ({})(i+1);\n".format(real),
])
target = DuneTarget()
from loopy.codegen import CodeGenerationState
codegen_state = CodeGenerationState(kernel=constructor_knl,
implemented_data_info=None,
implemented_domain=None,
implemented_predicates=frozenset(),
seen_dtypes=frozenset(),
seen_functions=frozenset(),
seen_atomic_dtypes=frozenset(),
var_subst_map={},
allow_complex=False,
is_generating_device_code=True,
)
if define_thetas:
for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
f.write(" {}\n".format(next(iter(decl.generate()))))
for _, line in constructor_knl.preambles:
if "gfsu" not in line:
f.write(" {}\n".format(line))
        # Add setup code for theta matrices. This writes a few lines that are
        # not strictly necessary, but removing them would be more work than
        # keeping them.
for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
f.write(" {}\n".format(line))
        # Introduce a variable that makes sure the kernel cannot be optimized away
f.writelines([" {} accum;\n".format(real),
" std::mt19937 rng;\n",
" rng.seed(42);\n",
" std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
])
def generate_standalone_code_google_benchmark(sf, filename):
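    """Write a standalone Google Benchmark driver for the sum factorization
    kernel sf to filename."""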
delete_cache_items("kernel_default")
with option_context(opcounter=False):
# Extract sum factorization kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
# Add the implementation of the kernel.
        # TODO: This can probably be done in a safer way?
first_line = knl.member.lines[0]
arguments = first_line[first_line.find("(") + 1:first_line.find(")")]
with open(filename, "w") as f:
f.writelines(["// {}".format(first_line),
"\n",
"#include \"config.h\"\n",
"#include \"benchmark/benchmark.h\"\n",
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
"#include<dune/codegen/common/vectorclass.hh>\n",
"#include<dune/codegen/sumfact/onedquadrature.hh>\n",
"#include<dune/codegen/sumfact/horizontaladd.hh>\n",
"#include<random>\n",
"#include<fstream>\n",
"#include<iostream>\n",
"\n"
])
write_global_data(sf, filename)
with open(filename, "a") as f:
arguments = ', '.join(sf.interface.signature_args)
if len(arguments) > 0:
arguments = ', ' + arguments
arguments = 'const char* buffer0, const char* buffer1' + arguments
f.write("void sumfact_kernel({})\n".format(arguments))
for line in knl.member.lines[1:]:
f.write("{}\n".format(line))
f.write("\n\n")
f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
write_setup_code(sf, filename, define_thetas=False)
additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
additional_arguments = ', '.join(additional_arguments)
if len(additional_arguments) > 0:
additional_arguments = ', ' + additional_arguments
with open(filename, "a") as f:
f.writelines([" for (auto _ : state){\n",
" sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments),
" }\n",
"}\n",
"BENCHMARK(BM_sumfact_kernel);\n",
"\n",
"BENCHMARK_MAIN();"
])
def generate_standalone_code(sf, filename):
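    """Write a standalone benchmark program for the sum factorization kernel
    sf to filename. The program times repeated kernel executions with the
    TSC and writes the cycles per execution to the file named by its first
    command line argument."""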
delete_cache_items("kernel_default")
with option_context(opcounter=False):
# Extract sum factorization kernel
from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
knl = realize_sumfact_kernel_function(sf)
first_line = knl.member.lines[0]
with open(filename, "w") as f:
f.writelines(["// {}".format(first_line),
"\n",
"#include \"config.h\"\n",
"#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
"#include<dune/codegen/common/tsc.hh>\n",
"#include<dune/codegen/common/vectorclass.hh>\n",
"#include<dune/codegen/sumfact/onedquadrature.hh>\n",
"#include<dune/codegen/sumfact/horizontaladd.hh>\n",
"#include<random>\n",
"#include<fstream>\n",
"#include<iostream>\n",
"\n"
])
f.writelines(["int main(int argc, char** argv)\n",
"{\n",
])
write_setup_code(sf, filename)
# Write measurement
with open(filename, "a") as f:
# Start a TSC timer
f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
])
# Add the implementation of the kernel.
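            # Choose the repeat count such that one measurement performs
            # roughly 1e9 floating point operations in total.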
repeats = int(1e9 / sf.operations)
f.write(" for(int i=0; i<{}; ++i)\n".format(repeats))
f.write(" {\n")
for line in knl.member.lines[1:]:
f.write(" {}\n".format(line))
f.write(" }\n")
# Stop the TSC timer and write the result to a file
f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n",
" std::ofstream file;\n",
" file.open(argv[1]);\n",
" file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
" file.close();\n",
" accum += output[dis(rng)];\n",
" std::cout << accum;\n",
"}\n",
])
def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
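    """Write a Google Benchmark driver for a single loopy kernel to filename.

    Parameters
    ----------
    kernel: loopy.kernel.LoopKernel
    signature: list of str
        The C++ signature of the kernel function (expected to hold exactly
        one entry)
    filename: str
    transformations: list of str
        Applied transformations; recorded as comments in the generated
        source
    """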
with option_context(opcounter=False):
# Remove opcounter from signature
p = re.compile('OpCounter::OpCounter<([^>]*)>')
assert len(signature) == 1
sig = signature[0]
sig = p.sub(r'\1', sig)
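        # e.g. 'OpCounter::OpCounter<double>' becomes plain 'double'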
        assert 'OpCounter' not in sig
# Which transformations were applied
codegen_transformations = ''
if transformations:
codegen_transformations = ''
for trafo in transformations:
codegen_transformations += '// {}\n'.format(trafo)
template = 'kernel_benchmark_template1.cc.in'
use_datasets = True
# Old benchmark template
# template = 'kernel_benchmark_template0.cc.in'
# use_datasets = False
template_filename = pkg_resources.resource_filename(__name__, template)
with open(template_filename, 'r') as f:
benchmark = f.read()
# Find function arguments and global arguments
arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
arguments = [a.split(' ')[-1] for a in arguments]
global_args = [a for a in kernel.args if a.name not in arguments]
buffer_arguments = [a for a in arguments if a.startswith('buff')]
input_arguments = [a for a in arguments if a not in buffer_arguments]
# Declare global arguments
codegen_declare_global_arguments = ''
        target = DuneTarget()
        ast_builder = target.get_device_ast_builder()
for g in global_args:
decl_info = g.decl_info(target, True, g.dtype)
for idi in decl_info:
                arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
codegen_declare_global_arguments += '{}\n'.format(arg_decl)
codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, ' ')
# Helper function for argument initialization
def _initialize_arg(arg):
if isinstance(arg, lp.ValueArg):
return []
real = type_floatingpoint()
size = reduce(mul, arg.shape)
fill_name = arg.name + '_fill'
            lines = [' {0}* {1} = ({0} *) {2};'.format(real, fill_name, arg.name),
' for (std::size_t i=0; i<{}; ++i){{'.format(size),
' {}[i] = unif(re);'.format(fill_name),
' }']
return lines
# Initialize global arguments
codegen_initialize_global_arguments = ''
for arg in global_args:
lines = _initialize_arg(arg)
codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, ' ')
codegen_initialize_input = ''
# Function we want to benchmark
codegen_benchmark_function = ''
codegen_benchmark_function += sig[0:sig.find(')') + 1]
codegen_benchmark_function += lp.generate_body(kernel)
codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, ' ')
# Declare function arguments
codegen_declare_arguments = []
codegen_declare_input = []
function_arguments = [a for a in kernel.args if a.name in arguments]
for arg in function_arguments:
if 'buffer' in arg.name:
byte_size = reduce(mul, arg.shape) * 8
codegen_declare_arguments.append(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
byte_size,
arg.alignment),)
elif isinstance(arg, lp.ValueArg):
assert 'jacobian_offset' in arg.name
decl = arg.get_arg_decl(ast_builder)
decl = Initializer(decl, 'unif_int(re)')
codegen_declare_arguments.append((' {}\n'.format(decl)))
            else:
                assert 'fastdg' in arg.name
size = reduce(mul, arg.shape)
min_stride = min([tag.stride for tag in arg.dim_tags])
size *= min_stride
alignment = arg.dtype.itemsize
real = type_floatingpoint()
if use_datasets:
codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
arg.name,
size,
alignment)))
else:
codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
arg.name,
size,
alignment)))
codegen_declare_arguments = ''.join(codegen_declare_arguments)
codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, ' ')
codegen_declare_input = ''.join(codegen_declare_input)
codegen_declare_input = textwrap.indent(codegen_declare_input, ' ')
# Initialize function arguments
codegen_initialize_arguments = ''
codegen_initialize_input = ''
for arg in function_arguments:
if 'fastdg' in arg.name:
if use_datasets:
lines = _initialize_arg(arg)
lines = [' ' + a for a in lines]
lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
lines.append('}')
codegen_initialize_input += '\n'.join(lines) + '\n'
else:
lines = _initialize_arg(arg)
codegen_initialize_arguments += '\n'.join(lines) + '\n'
else:
lines = _initialize_arg(arg)
codegen_initialize_arguments += '\n'.join(lines) + '\n'
codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, ' ')
codegen_initialize_input = textwrap.indent(codegen_initialize_input, ' ')
# Call the benchmark function
if use_datasets:
        if use_datasets:
            arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
            codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
            codegen_call_benchmark_function += ' ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
            for arg in input_arguments:
                codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
            codegen_call_benchmark_function += '}'
        else:
            codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, ' ')
# Replace placeholders in benchmark template
benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
# Write benchmark source file
with open(filename, 'w') as f:
            f.write(benchmark)
def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
"""Generate an microbenchmark, compile run and return time
Parameters
----------
sf: SumfactKernel or VectorizedSumfactKernel
kernel: loopy.kernel.LoopKernel
signature: str
transformation: list of str
Will be used to distinguish between autotune targets
"""
if sf is None:
assert kernel is not None
assert signature is not None
else:
assert kernel is None
assert signature is None
# Make sure that the benchmark directory exists
dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
if not os.path.exists(dir):
os.mkdir(dir)
if sf is None:
basename = "autotune_sumfact_{}".format(kernel.name)
else:
basename = "autotune_sumfact_{}".format(sf.function_name)
if transformations:
for trafo in transformations:
basename = '{}_{}'.format(basename, trafo)
basename = hashlib.sha256(basename.encode()).hexdigest()
filename = os.path.join(dir, "{}.cc".format(basename))
logname = os.path.join(dir, "{}.log".format(basename))
lock = os.path.join(dir, "{}.lock".format(basename))
executable = os.path.join(dir, basename)
# Generate and compile a benchmark program
#
# Note: cache restoring is only necessary when generating from SumfactKernel
with cache_restoring():
with filelock.FileLock(lock):
            if not os.path.isfile(logname):
                logger.debug('Generate autotune target in file {}'.format(filename))
                if sf is None:
                    generate_standalone_kernel_code(kernel, signature, filename, transformations)
                elif get_option("autotune_google_benchmark"):
                    generate_standalone_code_google_benchmark(sf, filename)
                else:
                    generate_standalone_code(sf, filename)
                call = []
                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_COMPILATION_WRAPPER")
                if wrapper:
                    call.append(wrapper)
                call.extend(compiler_invocation(executable, filename))
                devnull = open(os.devnull, 'w')
                os.environ['DUNE_CODEGEN_THREADS'] = '1'
                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
# File system synchronization!
while not os.path.exists(executable):
time.sleep(0.01)
# Check whether the user specified an execution wrapper
call = []
wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER")
                if wrapper:
                    call.append(wrapper)

                # Run the benchmark program
                call.append(executable)
                if get_option("autotune_google_benchmark"):
                    call.append("--benchmark_out={}".format(logname))
                    # call.append("--benchmark_out_format=csv")
                else:
                    call.append(logname)
                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
# File system synchronization!
while not os.path.exists(logname):
time.sleep(0.01)
    # Extract the result from the log file
if get_option("autotune_google_benchmark"):
import json
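        # Google Benchmark may write aggregate entries (suffixed _mean,
        # _median, _stddev) alongside the individual repetitions; skip those
        # and take the fastest individual cpu_time.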
with open(logname) as json_file:
try:
data = json.load(json_file)
minimal_time = 1e80
for b in data['benchmarks']:
if b['name'].endswith('_mean') or b['name'].endswith('_median') or b['name'].endswith('_stddev'):
pass
else:
if b['cpu_time'] < minimal_time:
minimal_time = b['cpu_time']
assert minimal_time < 1e80
return minimal_time
except Exception as e:
print("Error while loading file {}".format(logname))
raise e
else:
return float(next(iter(open(logname, "r")))) / 1000000