def _signature_argument_names(sig):
    """Return the parameter names listed in the C function signature *sig*.

    The text between the first '(' and the first ')' is split at commas and
    the last whitespace-separated token of every non-empty entry is taken as
    the parameter name. An empty parameter list yields an empty list.
    """
    parameter_list = sig[sig.find('(') + 1:sig.find(')')]
    return [p.split()[-1] for p in parameter_list.split(',') if p.strip()]


def generate_standalone_kernel_code(kernel, signature, filename):
    """Write a standalone C++ benchmark program for *kernel* to *filename*.

    The generated file contains the kernel arguments that are not part of the
    function signature as file-scope arrays, the kernel function itself, a
    ``BM_sumfact_kernel`` driver that declares and randomly fills all
    arguments before calling the kernel in the benchmark loop, and the
    ``BENCHMARK``/``BENCHMARK_MAIN`` boilerplate.

    :arg kernel: The kernel object; its ``args`` and ``name`` are read and it
        is passed to ``lp.generate_body``.
    :arg signature: A sequence holding exactly one string, the C signature of
        the kernel function.
    :arg filename: Path of the file to (over)write.
    :raises AssertionError: If *signature* does not hold exactly one entry or
        a signature argument is neither a buffer, a ``jacobian_offset`` value
        argument nor a ``fastdg`` array.
    """
    # Validate and parse the signature before touching the file system so a
    # bad call does not leave a half-written benchmark file behind.
    assert len(signature) == 1
    sig = signature[0]
    arguments = _signature_argument_names(sig)

    # Everything the kernel needs but the signature does not pass in has to be
    # declared at file scope.
    global_args = [a for a in kernel.args if a.name not in arguments]
    function_arguments = [a for a in kernel.args if a.name in arguments]

    target = DuneTarget()
    # Create the AST builder once, up front: it is also needed for value
    # arguments below, even if there is no global argument at all.
    ast_builder = target.get_device_ast_builder()
    real = type_floatingpoint()

    def _initialize_arg(arg):
        """Return the C++ lines filling *arg* with random numbers."""
        if isinstance(arg, lp.ValueArg):
            # Value arguments are initialized at their declaration.
            return []
        size = reduce(mul, arg.shape)
        fill_name = arg.name + '_fill'
        # The cast has to match the declared pointer type, otherwise the
        # generated C++ does not compile for a non-double floating point type.
        return [' {}* {} = ({} *) {};'.format(real, fill_name, real, arg.name),
                ' for (std::size_t i=0; i<{}; ++i){{'.format(size),
                ' {}[i] = unif(re);'.format(fill_name),
                ' }']

    with open(filename, 'w') as f:
        # Write headers
        headers = ['#include "config.h"',
                   '#include <iostream>',
                   '#include <fstream>',
                   '#include <random>',
                   '#include "benchmark/benchmark.h"',
                   '#include <dune/codegen/common/vectorclass.hh>',
                   '#include <dune/codegen/sumfact/horizontaladd.hh>',
                   ]
        f.write("\n".join(headers))

        # Declare global arguments
        f.write('\n\n')
        for g in global_args:
            for idi in g.decl_info(target, True, g.dtype):
                arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
                arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
                arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
                f.write('{}\n'.format(arg_decl))

        # Generate function we want to benchmark
        f.write('\n')
        f.write(sig[0:sig.find(')') + 1])
        f.writelines(lp.generate_body(kernel))
        f.write('\n\n')

        # Generate function that will do the benchmarking
        f.write('static void BM_sumfact_kernel(benchmark::State& state){\n')

        # Declare random generators
        lines = [' std::uniform_real_distribution<{}> unif(0,1);'.format(real),
                 ' std::uniform_int_distribution<int> unif_int(0,128);',
                 ' std::default_random_engine re;']
        f.write('\n'.join(lines) + '\n')

        # Declare function arguments
        for arg in function_arguments:
            if 'buffer' in arg.name:
                # NOTE(review): the factor 8 presumably is the size of a
                # double in bytes -- confirm for other floating point types.
                byte_size = reduce(mul, arg.shape) * 8
                f.write(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
                                                                               byte_size,
                                                                               arg.alignment))
            elif isinstance(arg, lp.ValueArg):
                assert 'jacobian_offset' in arg.name
                decl = arg.get_arg_decl(ast_builder)
                decl = Initializer(decl, 'unif_int(re)')
                f.write(' {}\n'.format(decl))
            else:
                assert 'fastdg' in arg.name
                size = reduce(mul, arg.shape)
                alignment = arg.dtype.itemsize
                f.write(' {} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
                                                                             arg.name,
                                                                             size,
                                                                             alignment))

        # Initialize arguments (global ones as well as function arguments)
        for arg in kernel.args:
            f.write('\n'.join(_initialize_arg(arg)) + '\n')

        # Benchmark loop
        function_call = kernel.name + '({})'.format(','.join(arguments))
        f.writelines([' for (auto _ : state){\n',
                      ' {};\n'.format(function_call),
                      ' }\n',
                      ])
        f.write('}\n')

        # Benchmark main
        main = ['',
                'BENCHMARK(BM_sumfact_kernel);',
                '',
                'BENCHMARK_MAIN();']
        f.write('\n'.join(main))
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py index d020c490b89f224a8c1e43fdb83f8135d2c3eea5..38171bdb05554b00e050bd0fc8899cf585c19e61 100644 --- a/python/dune/codegen/sumfact/realization.py +++ b/python/dune/codegen/sumfact/realization.py @@ -122,14 +122,16 @@ class BufferSwitcher(object): self.current = 0 def get_temporary(self, sf=None, name=None, **kwargs): + assert sf assert name - bs = "buffer{}".format(self.current) - # Calculate correct alignment + bs = "buffer{}".format(self.current) shape = kwargs['shape'] assert shape dim_tags = kwargs['dim_tags'].split(',') assert dim_tags + + # Calculate correct alignment vec_size = 1 if 'vec' in dim_tags: vec_size = shape[dim_tags.index('vec')] @@ -142,8 +144,9 @@ class BufferSwitcher(object): # this buffer is used to store different data sizes we need to make # sure it is big enough. assert sf - size = _max_sum_factorization_buffer_size(sf) / vec_size - globalarg(bs, shape=(size, vec_size), alignment=alignment, dim_tags=['f', 'vec']) + size = _max_sum_factorization_buffer_size(sf) + globalarg(bs, shape=(size,), alignment=alignment, dim_tags=['f',]) + temporary_variable(name, managed=True, custom_base_storage=bs, @@ -304,12 +307,6 @@ def realize_sumfact_kernel_function(sf): ) }) - # Register kernel transformations - for trafo in sf.transformations: - transform(trafo.kernel_transformation()[0], - trafo.kernel_transformation()[1], - **trafo.kernel_transformation()[2]) - # Construct a loopy kernel object from dune.codegen.pdelab.localoperator import extract_kernel_from_cache args = ("const char* buffer0", "const char* buffer1") + sf.interface.signature_args diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py index 8fc82f3bc2650ea78cb73375c5fb501ee3522ea4..71fd8c71e94b5ca7feffa59ec876af8149fa039c 100644 --- a/python/dune/codegen/sumfact/symbolic.py +++ b/python/dune/codegen/sumfact/symbolic.py @@ -476,7 +476,6 @@ class 
SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): insn_dep=frozenset(), interface=SumfactKernelInterfaceBase(), predicates=frozenset(), - transformations=(), ): """Create a sum factorization kernel @@ -603,10 +602,6 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): name_quad_perm = "_qpperm_{}".format("".join(str(a) for a in self.interface.quadrature_permutation)) name = name + name_quad_perm - # Change name for applied transformations - for t in self.transformations: - name = name + '_' + t.name_appendix() - return name @property @@ -867,7 +862,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) buffer=buffer, insn_dep=insn_dep, vertical_width=vertical_width, - transformations=transformations, ) prim.Variable.__init__(self, "VecSUMFAC") @@ -896,11 +890,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) def function_name(self): name = "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted), self.interface.function_name_suffix) - - # Change name for applied transformations - for t in self.transformations: - name = name + '_' + t.name_appendix() - return name @property