diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py
index d383963318291ae947c70b1f083c320a60250303..22835c7cbce61336168396007e73703394eb7433 100755
--- a/bin/donkey_benchmark_execution_wrapper.py
+++ b/bin/donkey_benchmark_execution_wrapper.py
@@ -5,8 +5,16 @@
 import sys
 import subprocess
 import os
 
+os.environ['DUNE_CODEGEN_THREADS'] = '20'
+
 # Run the actual command
-command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+
+# The actual measurements will be run like this (MPI parallelism)
+# command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+
+# Command for autotune benchmarks with threading
+command = "srun -p haswell10c -n 1 -c 20 --hint=nomultithread --cpu_bin=verbose,core".split()
+
 command.extend(sys.argv[1:])
 
 ret = subprocess.call(command)
diff --git a/python/dune/codegen/logging.conf b/python/dune/codegen/logging.conf
index 32974b12744dd1c23c36f561148943ec936b3ccb..1fcc4ba02962460ea3f1db8c678dbba5ef3fef2e 100644
--- a/python/dune/codegen/logging.conf
+++ b/python/dune/codegen/logging.conf
@@ -1,5 +1,5 @@
 [loggers]
-keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.vectorization
+keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.transformations,dune.codegen.sumfact.vectorization
 
 [handlers]
 keys=consoleHandler
@@ -16,6 +16,12 @@
 handlers=consoleHandler
 qualname=dune.codegen.pdelab.localoperator
 propagate=0
 
+[logger_dune.codegen.sumfact.transformations]
+level=INFO
+handlers=consoleHandler
+qualname=dune.codegen.sumfact.transformations
+propagate=0
+
 [logger_dune.codegen.sumfact.vectorization]
 level=INFO
 handlers=consoleHandler
diff --git a/python/dune/codegen/loopy/transformations/performance.py b/python/dune/codegen/loopy/transformations/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..02644e80c3c0f782309a165c050949e0f1d71cb2
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/performance.py
@@ -0,0 +1,8 @@
+from dune.codegen.options import get_form_option
+from dune.codegen.sumfact.transformations import sumfact_performance_transformations
+
+
+def performance_transformations(kernel, signature):
+    if get_form_option("sumfact"):
+        kernel = sumfact_performance_transformations(kernel, signature)
+    return kernel
diff --git a/python/dune/codegen/loopy/transformations/remove_reductions.py b/python/dune/codegen/loopy/transformations/remove_reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ccca09c230f5ec32a9134035f3e883c63cc3680
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/remove_reductions.py
@@ -0,0 +1,71 @@
+import loopy as lp
+
+import pymbolic.primitives as prim
+
+
+def remove_reduction(knl, match):
+    """Remove all matching reductions and accumulate directly in the assignee instead"""
+
+    # Find reductions
+    for instr in lp.find_instructions(knl, match):
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            instructions = []
+
+            # Dependencies
+            depends_on = instr.depends_on
+            depending = []
+            for i in knl.instructions:
+                if instr.id in i.depends_on:
+                    depending.append(i.id)
+
+            # Remove the instruction from the kernel
+            knl = lp.remove_instructions(knl, set([instr.id]))
+
+            # Add instruction that sets assignee to zero
+            id_zero = instr.id + '_set_zero'
+            instructions.append(lp.Assignment(instr.assignee,
+                                              0,
+                                              within_inames=instr.within_inames,
+                                              id=id_zero,
+                                              tags=('set_zero',)
+                                              ))
+
+            # Add instruction that accumulates directly in assignee
+            assignee = instr.assignee
+            expression = prim.Sum((assignee, instr.expression.expr))
+            within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
+            id_accum = instr.id + '_accum'
+            instructions.append(lp.Assignment(assignee,
+                                              expression,
+                                              within_inames=within_inames,
+                                              id=id_accum,
+                                              depends_on=frozenset((id_zero,) + tuple(depends_on)),
+                                              tags=('assignment',)))
+
+            knl = knl.copy(instructions=knl.instructions + instructions)
+
+            # Restore dependencies
+            for dep in depending:
+                match = lp.match.Id(dep)
+                knl = lp.add_dependency(knl, match, id_accum)
+    return knl
+
+
+def remove_all_reductions(knl):
+    """Remove all reductions from loopy kernel
+
+    This removes all reductions by instead setting the assignee to zero and
+    directly accumulating in the assignee.
+    """
+    # Find ids of all reductions
+    ids = []
+    for instr in knl.instructions:
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            ids.append(instr.id)
+
+    # Remove reductions
+    for id in ids:
+        match = lp.match.Id(id)
+        knl = remove_reduction(knl, match)
+
+    return knl
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 257da40169f578a4b64a367f6aaa4cd55729253d..a4f65ce476fe2f610135234fb8d5a16b97401dec 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -85,6 +85,9 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact = CodegenOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
+    sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in sum factorization function using autotuning.")
+    sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.")
+    sumfact_performance_transformations_testrun = CodegenOption(default=0, helpstr="If larger than zero, determines which test case to run.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index eb7e1d3a50e7f8ba4b7b0f476c54565925c15d2a..dfc553fbefee7fa8c2e6aa1d92ae7ea047acdd38 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -594,6 +594,10 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     for trafo in transformations:
         kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
+    # Apply performance transformations
+    from dune.codegen.loopy.transformations.performance import performance_transformations
+    kernel = performance_transformations(kernel, signature)
+
     from dune.codegen.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index ae4a763063f0dc25303fc517356b58b02551eebc..c1ccd13a5552980db0be3788651065dc8a39654c 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -164,7 +164,7 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         from dune.codegen.sumfact.basis import SumfactBasisMixin
         return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The result of stage 2 has the correct quadrature permutation but no
         # cost permutation is applied. The inames for this method are
         # quadrature and cost permuted. This means we need to reverse the cost
@@ -175,7 +175,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index a1d7442addc4e2d1d720b1cc81d8a914c1f75532..1e801f4550d17d89ef8972fe3ffa867d4a44ea25 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -1,21 +1,28 @@
 """ Autotuning for sum factorization kernels """
-from dune.codegen.generation import cache_restoring, delete_cache_items
-from dune.codegen.loopy.target import DuneTarget
-from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
-from dune.codegen.options import get_option, set_option
-from dune.codegen.error import CodegenAutotuneError
-
-import loopy as lp
-from pytools import product
-
 import os
 import re
 import subprocess
 import filelock
 import hashlib
+import logging
+import json
+from operator import mul
+import pkg_resources
+from six.moves import reduce
+import textwrap
 import time
 
+import loopy as lp
+from pytools import product
+from cgen import ArrayOf, AlignedAttribute, Initializer
+
+from dune.codegen.generation import cache_restoring, delete_cache_items
+from dune.codegen.loopy.target import DuneTarget, type_floatingpoint
+from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
+from dune.codegen.options import get_option, set_option
+from dune.codegen.error import CodegenAutotuneError
+
 
 def get_cmake_cache_entry(entry):
     for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
@@ -301,13 +308,207 @@ def generate_standalone_code(sf, filename):
     set_option("opcounter", opcounting)
 
 
-def autotune_realization(sf):
+def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
+    # Turn off opcounting
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
+    # Remove opcounter from signature
+    p = re.compile('OpCounter::OpCounter<([^>]*)>')
+    assert len(signature) == 1
+    sig = signature[0]
+    sig = p.sub(r'\1', sig)
+    assert 'OpCounter' not in sig
+
+    # Which transformations were applied
+    codegen_transformations = ''
+    if transformations:
+        for trafo in transformations:
+            codegen_transformations += '// {}\n'.format(trafo)
+
+    template = 'kernel_benchmark_template1.cc.in'
+    use_datasets = True
+
+    # Old benchmark template
+    # template = 'kernel_benchmark_template0.cc.in'
+    # use_datasets = False
+
+    template_filename = pkg_resources.resource_filename(__name__, template)
+    with open(template_filename, 'r') as f:
+        benchmark = f.read()
+
+    # Find function arguments and global arguments
+    arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
+    arguments = [a.split(' ')[-1] for a in arguments]
+    global_args = [a for a in kernel.args if a.name not in arguments]
+    buffer_arguments = [a for a in arguments if a.startswith('buff')]
+    input_arguments = [a for a in arguments if a not in buffer_arguments]
+
+    # Declare global arguments
+    codegen_declare_global_arguments = ''
+    target = DuneTarget()
+    for g in global_args:
+        decl_info = g.decl_info(target, True, g.dtype)
+        for idi in decl_info:
+            ast_builder = target.get_device_ast_builder()
+            arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
+            arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
+            arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
+            codegen_declare_global_arguments += '{}\n'.format(arg_decl)
+
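+    # As a hypothetical example: a global argument named 'mat' of shape
+    # (3, 9) with 8-byte double entries and vector size 4 would be declared
+    # along the lines of
+    #   double mat[27] __attribute__ ((aligned (32)));
+    # i.e. a flat C array sized by the product of the loopy shape and
+    # aligned to itemsize times vector size.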
+    codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, ' ')
+
+    # Helper function for argument initialization
+    def _initialize_arg(arg):
+        if isinstance(arg, lp.ValueArg):
+            return []
+        real = type_floatingpoint()
+        size = reduce(mul, arg.shape)
+        fill_name = arg.name + '_fill'
+        lines = [' {}* {} = ({} *) {};'.format(real, fill_name, real, arg.name),
+                 ' for (std::size_t i=0; i<{}; ++i){{'.format(size),
+                 '   {}[i] = unif(re);'.format(fill_name),
+                 ' }']
+        return lines
+
+    # Initialize global arguments
+    codegen_initialize_global_arguments = ''
+    for arg in global_args:
+        lines = _initialize_arg(arg)
+        codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, ' ')
+
+    # Function we want to benchmark
+    codegen_benchmark_function = ''
+    codegen_benchmark_function += sig[0:sig.find(')') + 1]
+    codegen_benchmark_function += lp.generate_body(kernel)
+    codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, ' ')
+
+    # Declare function arguments
+    codegen_declare_arguments = []
+    codegen_declare_input = []
+    function_arguments = [a for a in kernel.args if a.name in arguments]
+    for arg in function_arguments:
+        if 'buffer' in arg.name:
+            # Note: assumes 8-byte entries in the buffer
+            byte_size = reduce(mul, arg.shape) * 8
+            codegen_declare_arguments.append(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
+                                                                                                     byte_size,
+                                                                                                     arg.alignment),)
+        elif isinstance(arg, lp.ValueArg):
+            assert 'jacobian_offset' in arg.name
+            decl = arg.get_arg_decl(ast_builder)
+            decl = Initializer(decl, 'unif_int(re)')
+            codegen_declare_arguments.append((' {}\n'.format(decl)))
+        else:
+            assert 'fastdg' in arg.name
+            size = reduce(mul, arg.shape)
+            min_stride = min([tag.stride for tag in arg.dim_tags])
+            size *= min_stride
+            alignment = arg.dtype.itemsize
+            real = type_floatingpoint()
+            if use_datasets:
+                codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                             arg.name,
+                                                                                                             size,
+                                                                                                             alignment)))
+            else:
+                codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
+                                                                                                   arg.name,
+                                                                                                   size,
+                                                                                                   alignment)))
+
+    codegen_declare_arguments = ''.join(codegen_declare_arguments)
+    codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, ' ')
+    codegen_declare_input = ''.join(codegen_declare_input)
+    codegen_declare_input = textwrap.indent(codegen_declare_input, ' ')
+
+    # Initialize function arguments
+    codegen_initialize_arguments = ''
+    codegen_initialize_input = ''
+    for arg in function_arguments:
+        if 'fastdg' in arg.name:
+            if use_datasets:
+                lines = _initialize_arg(arg)
+                lines = [' ' + a for a in lines]
+                lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
+                lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
+                lines.append('}')
+                codegen_initialize_input += '\n'.join(lines) + '\n'
+            else:
+                lines = _initialize_arg(arg)
+                codegen_initialize_arguments += '\n'.join(lines) + '\n'
+        else:
+            lines = _initialize_arg(arg)
+            codegen_initialize_arguments += '\n'.join(lines) + '\n'
+    codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, ' ')
+    codegen_initialize_input = textwrap.indent(codegen_initialize_input, ' ')
+
+    # Call the benchmark function
+    if use_datasets:
+        arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
+        codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
+        codegen_call_benchmark_function += ' ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
+        for arg in input_arguments:
+            codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
+        codegen_call_benchmark_function += '}'
+    else:
+        codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
+    codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, ' ')
+
+    # Replace placeholders in benchmark template
+    benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
+    benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
+    benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
+    benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
+    benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)
+
+    # Write benchmark source file
+    with open(filename, 'w') as f:
+        f.writelines(benchmark)
+
+    # Maybe turn opcounting on again
+    set_option("opcounter", opcounting)
+
+
+def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
+    """Generate a microbenchmark, compile and run it, and return the measured time
+
+    Parameters
+    ----------
+    sf: SumfactKernel or VectorizedSumfactKernel
+    kernel: loopy.kernel.LoopKernel
+    signature: str
+    transformations: list of str
+        Will be used to distinguish between autotune targets
+    """
+    if sf is None:
+        assert kernel is not None
+        assert signature is not None
+    else:
+        assert kernel is None
+        assert signature is None
+
+    logger = logging.getLogger(__name__)
+
     # Make sure that the benchmark directory exists
     dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
     if not os.path.exists(dir):
         os.mkdir(dir)
 
-    basename = "autotune_sumfact_{}".format(sf.function_name)
+    if sf is None:
+        basename = "autotune_sumfact_{}".format(kernel.name)
+    else:
+        basename = "autotune_sumfact_{}".format(sf.function_name)
+    if transformations:
+        for trafo in transformations:
+            basename = '{}_{}'.format(basename, trafo)
 
     basename = hashlib.sha256(basename.encode()).hexdigest()
 
     filename = os.path.join(dir, "{}.cc".format(basename))
@@ -316,10 +517,16 @@ def autotune_realization(sf):
     executable = os.path.join(dir, basename)
 
     # Generate and compile a benchmark program
+    #
+    # Note: cache restoring is only necessary when generating from SumfactKernel
     with cache_restoring():
         with filelock.FileLock(lock):
             if not os.path.isfile(logname):
-                if get_option("autotune_google_benchmark"):
+                logger.debug('Generate autotune target in file {}'.format(filename))
+
+                if sf is None:
+                    generate_standalone_kernel_code(kernel, signature, filename, transformations)
+                elif get_option("autotune_google_benchmark"):
                     generate_standalone_code_google_benchmark(sf, filename)
                 else:
                     generate_standalone_code(sf, filename)
 
                 call.extend(compiler_invocation(executable, filename))
                 devnull = open(os.devnull, 'w')
+                os.environ['DUNE_CODEGEN_THREADS'] = '1'
                 ret = subprocess.call(call,
                                      stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
@@ -349,6 +557,7 @@ def autotune_realization(sf):
             call.append(executable)
             if get_option("autotune_google_benchmark"):
                 call.append("--benchmark_out={}".format(logname))
+                call.append("--benchmark_repetitions=5")
                 # call.append("--benchmark_out_format=csv")
             else:
                 call.append(logname)
@@ -366,7 +575,15 @@ def autotune_realization(sf):
             with open(logname) as json_file:
                 try:
                     data = json.load(json_file)
-                    return data['benchmarks'][0]['cpu_time']
+                    minimal_time = 1e80
+                    for b in data['benchmarks']:
+                        # Skip the aggregate entries added by --benchmark_repetitions
+                        if b['name'].endswith('_mean') or b['name'].endswith('_median') or b['name'].endswith('_stddev'):
+                            continue
+                        if b['cpu_time'] < minimal_time:
+                            minimal_time = b['cpu_time']
+                    assert minimal_time < 1e80
+                    return minimal_time
                 except Exception as e:
                     print("Error while loading file {}".format(logname))
                     raise e
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index c9b75eb445af01e9420e850650b12054dd0115f3..757d51870aa7ccf690b9362b9b0522a4892f792b 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -345,14 +345,15 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Note: Here we do not need to reverse any permutation since this is
         # already done in the setup_input method above!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index d37558f5c40f42355b5fa99ed9a4ea25ab7ade28..8ac8aaa4faa1a4849c3b88fb54f87b0f87ca7b2b 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -456,11 +456,12 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c4b49da5aec59006a9f007534dc7711b98ef0387
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in
@@ -0,0 +1,37 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+#include <iostream>
+#include <fstream>
+#include <random>
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+${CODEGEN_BENCHMARK_FUNCTION}
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  std::uniform_real_distribution<double> unif(0,1);
+  std::uniform_int_distribution<int> unif_int(0,1);
+  std::default_random_engine re;
+
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+  ${CODEGEN_INITIALIZE_INPUT}
+
+  ${CODEGEN_DECLARE_ARGUMENTS}
+  ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+  for (auto _ : state){
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+}
+
+${CODEGEN_BENCHMARK}
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c03577d672123b3063020841b731fc1fd9780fce
--- /dev/null
+++ b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in
@@ -0,0 +1,58 @@
+// Transformations:
+${CODEGEN_TRANSFORMATIONS}
+
+#include "config.h"
+
+#include <iostream>
+#include <fstream>
+#include <random>
+#include <cstdlib>
+
+#include "benchmark/benchmark.h"
+#include <dune/codegen/common/vectorclass.hh>
+#include <dune/codegen/sumfact/horizontaladd.hh>
+
+const int datasets = 512;
+
+class Benchmark{
+  ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}
+
+  ${CODEGEN_DECLARE_INPUT}
+
+public:
+  Benchmark (){
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}
+
+    ${CODEGEN_INITIALIZE_INPUT}
+  }
+
+  ${CODEGEN_BENCHMARK_FUNCTION}
+
+  void run_benchmark(){
+    ${CODEGEN_DECLARE_ARGUMENTS}
+
+    std::uniform_real_distribution<double> unif(0,1);
+    std::uniform_int_distribution<int> unif_int(0,1);
+    std::default_random_engine re;
+
+    ${CODEGEN_INITIALIZE_ARGUMENTS}
+
+    ${CODEGEN_CALL_BENCHMARK_FUNCTION}
+  }
+
+};
+
+static void BM_sumfact_kernel(benchmark::State& state){
+  Benchmark benchmark;
+  for (auto _ : state){
+    benchmark.run_benchmark();
+  }
+}
+
+BENCHMARK(BM_sumfact_kernel) -> Threads(atoi(std::getenv("DUNE_CODEGEN_THREADS"))) -> UseRealTime();
+
+BENCHMARK_MAIN();
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index 14cd0b1d0fe3814f294c22bc00f25260f4ee0233..461f5b0a3ace21e09a7a3707c4849aa1c36e6972 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -30,7 +30,6 @@ from dune.codegen.sumfact.quadrature import quadrature_points_per_direction
 from dune.codegen.sumfact.symbolic import (SumfactKernel,
                                            VectorizedSumfactKernel,
                                            )
-from dune.codegen.sumfact.vectorization import attach_vectorization_info
 from dune.codegen.sumfact.accumulation import sumfact_iname
 from dune.codegen.loopy.target import dtype_floatingpoint
 from dune.codegen.loopy.vcl import ExplicitVCLCast
@@ -61,6 +60,12 @@ def name_buffer_storage(buff, which):
     return name
 
 
+def _max_sum_factorization_buffer_size(sf):
+    size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
+               product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+    return size
+
+
 @kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
@@ -74,8 +79,7 @@ def _realize_sum_factorization_kernel(sf):
     for buf in buffers:
         # Determine the necessary size of the buffer. We assume that we do not
         # underintegrate the form!!!
-        size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
-                   product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+        size = _max_sum_factorization_buffer_size(sf)
 
         temporary_variable("{}_dummy".format(buf),
                            shape=(size,),
                            custom_base_storage=buf,
@@ -118,10 +122,32 @@ class BufferSwitcher(object):
     def __init__(self):
         self.current = 0
 
-    def get_temporary(self, name=None, **kwargs):
+    def get_temporary(self, sf=None, name=None, **kwargs):
+        assert sf
         assert name
+
         bs = "buffer{}".format(self.current)
-        globalarg(bs)
+
+        shape = kwargs['shape']
+        assert shape
+        dim_tags = kwargs['dim_tags'].split(',')
+        assert dim_tags
+
+        # Calculate correct alignment
+        vec_size = 1
+        if 'vec' in dim_tags:
+            vec_size = shape[dim_tags.index('vec')]
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = np.dtype(kwargs.get("dtype", dtype_floatingpoint()))
+        alignment = dtype.itemsize * vec_size
+
+        # Add this buffer as a global argument to the kernel. Add a shape to
+        # make benchmark generation for autotuning of loopy kernels easier.
+        # Since this buffer is used to store data of different sizes we need
+        # to make sure it is big enough.
+        size = _max_sum_factorization_buffer_size(sf)
+        globalarg(bs, shape=(size,), alignment=alignment, dim_tags=['f', ])
+
         temporary_variable(name,
                            managed=True,
                            custom_base_storage=bs,
@@ -193,7 +219,8 @@ def realize_sumfact_kernel_function(sf):
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             input_summand = sf.interface.realize_direct_input(input_inames, inp_shape)
         elif l == 0:
-            input_summand = sf.interface.realize_input(input_inames,
+            input_summand = sf.interface.realize_input(sf,
+                                                       input_inames,
                                                        inp_shape,
                                                        vec_iname,
                                                        vec_shape,
@@ -204,7 +231,8 @@ def realize_sumfact_kernel_function(sf):
             # Get a temporary that interprets the base storage of the input
            # as a column-major matrix. In later iteration of the matrix loop
            # this reinterprets the output of the previous iteration.
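+            # The kernel object sf is threaded through realize_input and
+            # get_temporary so that the buffer switcher can derive the full
+            # buffer size and alignment (see BufferSwitcher.get_temporary).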
-            inp = buffer.get_temporary("buff_step{}_in".format(l),
+            inp = buffer.get_temporary(sf,
+                                       "buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -250,7 +278,8 @@ def realize_sumfact_kernel_function(sf):
             output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation)
             output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation)
 
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -265,7 +294,8 @@ def realize_sumfact_kernel_function(sf):
         else:
             output_shape = tuple(out_shape[1:]) + (out_shape[0],)
 
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 60641b0ce4af4ddef525a4c03a438c4ebe277c0c..dfd9383f93c79d3c50bfcca24cdd717fcc16aa58 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -60,7 +60,7 @@ class SumfactKernelInterfaceBase(object):
         """
         raise NotImplementedError
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         """Interpret the input of sumfact kernel function in the right way (non fastdgg)
 
         This happens inside the sumfact kernel function.
@@ -73,6 +73,7 @@ class SumfactKernelInterfaceBase(object):
 
         Parameters
         ----------
+        sf : SumfactKernel or VectorizedSumfactKernel
         inames : tuple of pymbolic.primitives.Variable
             Inames for accesing the input. Ordered according to permuted matrix
             sequence.
         shape : tuple of int
@@ -252,11 +253,12 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return prim.Call(prim.Variable(hadd_function), (result,))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The input for stage 3 is quadrature permuted. The inames and shape
         # passed to this method are quadrature and cost permuted. This means we
         # need to take the cost permutation back to get the right inames and
@@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         # Get a temporary that interprets the base storage of the input as a
         # column-major matrix.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -382,9 +385,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def realize_direct_output(self, result, inames, shape, **args):
         outputs = set(self.interfaces)
 
-        # If multiple horizontal_add's are to be performed with 'result'
-        # we need to precompute the result!
         if len(outputs) > 1:
+            # Introduce a substitution rule for the argument of the horizontal add
             substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
             subst_rule(substname, (), result)
             result = prim.Call(prim.Variable(substname), ())
@@ -503,6 +505,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     the transformation brings the next reduction index in the fastest
     position.
 
+    In einsum notation from numpy this can be written as three contractions
+    of the form 'ij,jkl->kli'.
+
     It can make sense to permute the order of directions. If you have a small
     m_l (e.g. stage 1 on faces) it is better to do direction l first. This
     can be done by:
@@ -533,6 +538,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
         other.
     interface: An SumfactKernelInterfaceBase instance describing the input
         (stage 1) or output (stage 3) of the kernel
+
     """
     # Assert the inputs!
     assert isinstance(matrix_sequence, tuple)
@@ -828,6 +834,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
                  vertical_width=1,
                  buffer=None,
                  insn_dep=frozenset(),
+                 transformations=(),
                  ):
         # Assert the input data structure
         assert isinstance(kernels, tuple)
@@ -880,8 +887,9 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     #
     @property
     def function_name(self):
-        return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
+        name = "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted),
                                     self.interface.function_name_suffix)
+        return name
 
     @property
     def cache_key(self):
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
new file mode 100644
index 0000000000000000000000000000000000000000..a752d217add1013bbcb22771824cf992103b2237
--- /dev/null
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -0,0 +1,422 @@
+import re
+import logging
+
+import loopy as lp
+import pymbolic.primitives as prim
+import islpy as isl
+
+from dune.codegen.generation import (get_counted_variable,
+                                     get_global_context_value,
+                                     )
+from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions, remove_reduction
+from dune.codegen.options import get_form_option, get_option
+from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.error import CodegenAutotuneError
+from dune.codegen.sumfact.autotune import autotune_realization
+
+
+def _current_iname_order(current_inames, new_iname_order):
+    """Sort the inames for this contraction according to new_iname_order"""
+    current_iname_order = []
+    for i in new_iname_order:
+        for j in current_inames:
+            if i in j:
+                current_iname_order.append(j)
+    return current_iname_order
+
+
+def _get_inames_of_reduction(instr, iname_permutation):
+    # Check which loops are inside the reduction loop
+    reduction_index = iname_permutation[-1]
+    within_reduction = tuple(i > reduction_index for i in iname_permutation)
+
+    # Basename of inner inames
+    dim = world_dimension()
+    inner_inames_base = []
+    for index, within in enumerate(within_reduction):
+        if within:
+            inner_inames_base.append('sf_out_inames_{}'.format(dim - 1 - index))
+
+    # Name of inner inames and outer inames
+    inner_inames = []
+    outer_inames = []
+    vec_inames = []
+    for i in [iname for iname in instr.within_inames]:
+        for j in inner_inames_base:
+            if j in i:
+                inner_inames.append(i)
+        if 'sf_vec' in i:
+            vec_inames.append(i)
+        elif i not in inner_inames:
+            outer_inames.append(i)
+
+    # Reduction iname
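+    # Reduction inames are generated with names of the form 'sf_red_<index>'.
+    # Recover the index from the textual form of the instruction, e.g. an
+    # instruction containing 'sf_red_2' yields reduction_iname 'sf_red_2'.
+    # At most one reduction iname is expected per instruction.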
+    regex = re.compile('sf_red_([0-9]*)')
+    reduction_index = set(regex.findall(str(instr)))
+    if len(reduction_index) == 0:
+        reduction_iname = None
+    else:
+        assert len(reduction_index) == 1
+        reduction_index = reduction_index.pop()
+        reduction_iname = 'sf_red_{}'.format(reduction_index)
+
+    return outer_inames, reduction_iname, inner_inames, vec_inames
+
+
+class FindReductionMapper(lp.symbolic.WalkMapper):
+    def __init__(self):
+        self.reductions = []
+
+    def map_reduction(self, expr, **kwargs):
+        self.reductions.append(expr)
+        return lp.symbolic.WalkMapper.map_reduction(self, expr, **kwargs)
+
+
+def _reorder_reduction_loops(kernel, match, iname_order):
+    """Reorder loops in instructions containing one reduction
+
+    In order to reorder the loops we need to manually do the accumulation in a
+    temporary that is large enough to fit all the data created in loops inside
+    the reduction loop.
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    match: loopy match expression specifying the instructions to transform
+    iname_order: tuple of str
+        preferred loop order for this instruction
+    """
+    instructions = lp.find_instructions(kernel, match)
+    for instr in instructions:
+        # Get reduction
+        reduction = FindReductionMapper()
+        reduction(instr.expression)
+        assert len(reduction.reductions) == 1
+        reduction = reduction.reductions[0]
+
+        # Reduction iname, inner inames and vector inames
+        assert len(reduction.inames) == 1
+        reduction_iname = reduction.inames[0]
+        assert set([reduction_iname]) == instr.reduction_inames()
+        if reduction_iname not in iname_order:
+            kernel = lp.prioritize_loops(kernel, iname_order)
+            continue
+        inner_inames = iname_order[iname_order.index(reduction_iname) + 1:]
+        if len(inner_inames) == 0:
+            kernel = lp.prioritize_loops(kernel, iname_order)
+            continue
+        # TODO: There should be a better way to do that
+        vec_inames = [i for i in instr.within_inames if 'sf_vec' in i]
+        assert len(vec_inames) < 2
+
+        # {{{ Create new temporary variable
+
+        # Create dim_tags
+        dim_tags = ','.join(['f'] * len(inner_inames))
+        vectorized = len(vec_inames) > 0
+        if vectorized:
+            assert len(vec_inames) == 1
+            dim_tags = dim_tags + ',vec'
+
+        # Create shape
+        shape = tuple(kernel.get_constant_iname_length(i) for i in inner_inames + vec_inames)
+
+        # Update temporary_variables of this kernel
+        from dune.codegen.loopy.temporary import DuneTemporaryVariable
+        accum_variable = get_counted_variable('accum_variable')
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = lp.types.NumpyType(dtype_floatingpoint())
+        var = {accum_variable: DuneTemporaryVariable(accum_variable,
+                                                     dtype=dtype,
+                                                     shape=shape,
+                                                     dim_tags=dim_tags,
+                                                     managed=True)}
+        tv = kernel.temporary_variables.copy()
+        tv.update(var)
+        kernel = kernel.copy(temporary_variables=tv)
+
+        # }}}
+
+        # Set accumulation variable to zero
+        accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
+        if vectorized:
+            accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        accum_init_id = instr.id + '_accum_init'
+        accum_init_instr = lp.Assignment(assignee,
+                                         0,
+                                         within_inames=instr.within_inames,
+                                         id=accum_init_id,
+                                         depends_on=instr.depends_on,
+                                         tags=('accum_init',),
+                                         )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
+
+        # Accumulate in temporary variable
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        expression = prim.Sum((assignee, reduction.expr))
+        within_inames = frozenset(tuple(instr.within_inames) + reduction.inames)
+        accum_id = instr.id + '_accum'
+        accum_instr = lp.Assignment(assignee,
+                                    expression,
+                                    within_inames=within_inames,
+                                    id=accum_id,
+                                    depends_on=frozenset([accum_init_id]),
+                                    tags=('accum',),
+                                    )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
+
+        # Replace reduction in insn with accumulation result
+        def map_reduction(expr, rec):
+            return assignee
+        mapper = lp.symbolic.ReductionCallbackMapper(map_reduction)
+        new_expression = mapper(instr.expression)
+        assign_id = instr.id + '_assign'
+        new_instr = instr.copy(expression=new_expression,
+                               id=assign_id,
+                               depends_on=frozenset(list(instr.depends_on) + [accum_id]))
+        kernel = kernel.copy(instructions=kernel.instructions + [new_instr])
+
+        # Fix dependencies and remove old instruction
+        for i in kernel.instructions:
+            if instr.id in i.depends_on:
+                match = lp.match.Id(i.id)
+                kernel = lp.add_dependency(kernel, match, assign_id)
+        kernel = lp.remove_instructions(kernel, set([instr.id]))
+
+        # Duplicate and reorder loops for accumulation
+        ids = [accum_init_id, accum_id]
+        duplicate_inames = tuple(inner_inames)
+        for idx in ids:
+            match = lp.match.Id(idx)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            current_iname_order = _current_iname_order(match_inames, iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+
+        # Reorder assignment loops
+        kernel = lp.prioritize_loops(kernel, iname_order)
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction accumulating directly in the data structure"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames,
+                                                   new_iname_order)
+
+        # We can directly use lp.prioritize_loops if
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        if isinstance(instr.expression, lp.symbolic.Reduction):
+            # If the instruction is a reduction we can directly accumulate in the assignee
+            assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
+            match = lp.match.Id(instr.id)
+            kernel = remove_reduction(kernel, match)
+
+            kernel = lp.prioritize_loops(kernel, current_iname_order)
+            duplicate_inames = tuple(inner_inames)
+            match = lp.match.Id(instr.id + '_set_zero')
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
+        else:
+            # In stage 3 this is usually not a reduction. In this case direct
+            # accumulation is not possible and we accumulate in a temporary
+            # variable instead
+            match = lp.match.Id(instr.id)
+            kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction using an accumulation variable"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames, new_iname_order)
+
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        match = lp.match.Id(instr.id)
+        kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variable=True):
+    """Change loop order in tensor contractions of sum factorization kernel
+
+    Since there is a reduction involved this implies more than just reordering
+    loops. There are two possibilities to handle the reduction:
+
+    1) Generate an object of the correct size for the accumulation and assign
+       to the result data structure afterwards
+
+    2) Make sure to set the correct entries of the result data structure to
+       zero and accumulate in place
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    iname_permutation: tuple of ints
+        How to permute the loop inames. Should contain a permutation of the
+        numbers 0 to dim.
+    accum_variable: bool
+        If True use method 1, else use method 2 from above.
+
+    Notes
+    -----
+    Using einsum notation from numpy a contraction of a sum factorization
+    kernel can be written as:
+
+    - 3D: 'ij,jkl->kli'
+    - 2D: 'ij,jk->ki'
+
+    In the sum factorization kernel itself those inames are called:
+
+    sf_out_inames_2_* : l
+    sf_out_inames_1_* : k
+    sf_out_inames_0_* : i
+    sf_red_* : j
+
+    where * represents the current direction (0,1,2 for 3D problems). The
+    default order of the loops is
+
+    (sf_out_inames_2_*, sf_out_inames_1_*, sf_out_inames_0_*, sf_red_*)
+
+    A permutation vector of (3,2,0,1) would mean that we do
+
+    (sf_out_inames_0_*, sf_red_*, sf_out_inames_1_*, sf_out_inames_2_*)
+
+    instead.
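+
+    For example, with iname_permutation=(3,2,0,1) in 3D the reduction loop is
+    no longer innermost. With accum_variable=True the contraction is
+    accumulated in a temporary sized for the loops inside the reduction and
+    assigned to the result afterwards (method 1); with accum_variable=False
+    the affected entries of the result are first set to zero and the kernel
+    then accumulates into them in place (method 2).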
+ + """ + if accum_variable: + kernel = _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation) + return kernel + else: + kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation) + return kernel + + +def tensor_contraction_loop_order_generator(kernel): + yield kernel, ['None'] + + dim = world_dimension() + identity = range(dim + 1) + import itertools + for permutation in itertools.permutations(identity): + # Culling of stupid variant + if permutation[0] == dim: + continue + + new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=True) + yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(permutation)] + + new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=False) + yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(permutation)] + + +def simple_autotuner(kernel_generator, signature): + kernel, transformations = next(kernel_generator) + best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations) + best_kernel = kernel + best_transformations = transformations + for kernel, transformations in kernel_generator: + cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations) + if cost < best_cost: + best_cost = cost + best_kernel = kernel + best_transformations = transformations + return best_kernel, best_transformations + + +def autotune_tensor_contraction_loop_order(kernel, signature): + logger = logging.getLogger(__name__) + + assert get_option('autotune_google_benchmark') + from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add + kernel = match_fused_multiply_add(kernel) + generator = tensor_contraction_loop_order_generator(kernel) + kernel, transformations = simple_autotuner(generator, signature) + logger.debug('autotute_tensor_contraction_loop_order - kernel {} transformations {}'.format(kernel.name, transformations)) + return kernel + + +def sumfact_performance_transformations(kernel, signature): + if get_form_option('sumfact_performance_transformations'): + assert not get_form_option('sumfact_performance_transformations_testrun') + if kernel.name.startswith('sfimpl'): + kernel = autotune_tensor_contraction_loop_order(kernel, signature) + + # Testing performance transformations + elif get_form_option('sumfact_performance_transformations_testrun'): + assert not get_form_option('sumfact_performance_transformations') + if kernel.name.startswith('sfimpl'): + dim = world_dimension() + testcase = get_form_option('sumfact_performance_transformations_testrun') + if dim == 2: + testrun_dict = { + 1: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), True))], + 2: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), False))], + 3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), True))], + 4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), False))], + } + for trafo, arguments in testrun_dict[testcase]: + kernel = trafo(kernel, *arguments) + else: + testrun_dict = { + 1: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), True))], + 2: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), False))], + 3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), True))], + 4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), False))], + } + for trafo, arguments in testrun_dict[testcase]: + kernel = trafo(kernel, *arguments) + return kernel diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt index 
index 5bb10fbcd303a916ce1ab353d7ecac8972f5dbe4..c81c94bc9fd536645149940d290846abce857715 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -134,3 +134,24 @@ if(benchmark_FOUND)
                                     INIFILE poisson_fastdg_volumes_3d_benchmark.mini
                                     )
 endif()
+
+
+#================================================
+# Poisson dg and fastdg with performance transformations
+#================================================
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_fastdg_2d_performance_transformations
+                                  INIFILE poisson_fastdg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_fastdg_3d_performance_transformations
+                                  INIFILE poisson_fastdg_3d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_dg_2d_performance_transformations
+                                  INIFILE poisson_dg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_dg_3d_performance_transformations
+                                  INIFILE poisson_dg_3d_performance_transformations.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..47b0e5269c9ebf62d03328e0f5370528a14ac9f6
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_dg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 0
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0b6d75f5c1181bcfe9f5c7ab9c6a941cd9653251
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini
@@ -0,0 +1,29 @@
+__name = sumfact_poisson_dg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
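+
+# Each testrun value selects one loop permutation from the testrun_dict in
+# dune/codegen/sumfact/transformations.py (with and without an accumulation
+# variable); see sumfact_performance_transformations there.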
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+# vectorization_strategy = explicit, none | expand gradvec
+vectorization_strategy = model, none | expand gradvec
+fastdg = 0
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
diff --git a/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..22461b9cc8bbafc6a9e15c2489bd35bf432bb5de
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = model, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..0e9cfea80bab55167a277ce8e189761e9b780420
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{testrun_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun}
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 1
+geometry_mixins = sumfact_equidistant
+sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun
+
+[formcompiler.ufl_variants]
+degree = 2