diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py index 9407709a515ef497b9f9b7561f570cf0411dd4ed..eb9d73aef388124ab61ad75faa0aa15f1252d3fb 100644 --- a/python/dune/codegen/options.py +++ b/python/dune/codegen/options.py @@ -3,6 +3,7 @@ from argparse import ArgumentParser from os.path import abspath from pytools import ImmutableRecord, memoize +from contextlib import contextmanager from dune.testtools.parametertree.parser import parse_ini_file @@ -297,21 +298,41 @@ def get_form_option(key, form=None): return getattr(processed_form_opts, key) -def option_switch(opt): - def _switch(): - if isinstance(opt, tuple): - opts = opt - else: - assert isinstance(opt, str) - opts = (opt,) - try: - for o in opts: - if get_option(o): - return o - return "default" - except AttributeError: - for o in opts: - if get_form_option(o): - return o - return "default" - return _switch +@contextmanager +def option_context(conditional=True, **opts): + """ A context manager that temporarily sets the given options and restores their previous values on exit. """ + # Backup old values and set to new ones + if conditional: + backup = {} + for k, v in opts.items(): + backup[k] = get_option(k) + set_option(k, v) + + try: + yield + finally: + if conditional: + # Restore old values even if the managed block raised + for k in opts.keys(): + set_option(k, backup[k]) + + +@contextmanager +def form_option_context(conditional=True, **opts): + """ A context manager that temporarily sets the given form options and restores their previous values on exit. """ + form = opts.pop("form", None) + + # Backup old values and set to new ones + if conditional: + backup = {} + for k, v in opts.items(): + backup[k] = get_form_option(k, form=form) + set_form_option(k, v, form=form) + + try: + yield + finally: + if conditional: + # Restore old values even if the managed block raised + for k in opts.keys(): + set_form_option(k, backup[k], form=form) diff --git a/python/dune/codegen/pdelab/basis.py b/python/dune/codegen/pdelab/basis.py index 39f867a7836c905d30e4ec124eedb6a214230e62..885194af5817bd30d6cc339603ab84f00bc7b990 100644 --- a/python/dune/codegen/pdelab/basis.py +++ b/python/dune/codegen/pdelab/basis.py @@ -8,9 +8,7 @@ from dune.codegen.generation import (basis_mixin, preamble, temporary_variable, ) -from dune.codegen.options import (option_switch, - get_form_option, - ) +from dune.codegen.options import get_form_option from dune.codegen.pdelab.argument import (name_applycontainer, name_coefficientcontainer, ) diff --git a/python/dune/codegen/pdelab/geometry.py b/python/dune/codegen/pdelab/geometry.py index 434f9f56d1098bd4c69b49bb79274a4c6f224581..df6b91548a5ccf217b3e36a7d176be828a35f374 100644 --- a/python/dune/codegen/pdelab/geometry.py +++ b/python/dune/codegen/pdelab/geometry.py @@ -12,9 +12,7 @@ from dune.codegen.generation import (class_member, temporary_variable, valuearg, ) -from dune.codegen.options import (get_form_option, - option_switch, - ) +from dune.codegen.options import get_form_option from dune.codegen.loopy.target import dtype_floatingpoint, type_floatingpoint from dune.codegen.pdelab.quadrature import (quadrature_preamble, ) diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py index a9293b582a50eef979efb7ecc63f2ab984a20563..835fe44e24df9f0849391e1f50422512df43c044 100644 --- a/python/dune/codegen/pdelab/localoperator.py +++ b/python/dune/codegen/pdelab/localoperator.py @@ -7,8 +7,8 @@ import numpy as np from dune.codegen.options import (get_form_option, get_option, - option_switch, - set_form_option) + form_option_context, + ) from dune.codegen.generation import (accumulation_mixin, base_class,
class_basename, @@ -41,6 +41,7 @@ from dune.codegen.cgen.clazz import (AccessModifier, ) from dune.codegen.loopy.target import type_floatingpoint from dune.codegen.ufl.modified_terminals import Restriction +from frozendict import frozendict import dune.codegen.loopy.mangler @@ -452,7 +453,6 @@ def generate_accumulation_instruction(expr, visitor): ) from dune.codegen.generation import instruction - from dune.codegen.options import option_switch quad_inames = visitor.quadrature_inames() lfs_inames = frozenset(visitor.test_info.inames) if visitor.trial_info: @@ -510,24 +510,35 @@ def visit_integral(integral): def generate_kernel(integrals): logger = logging.getLogger(__name__) - # Visit all integrals once to collect information (dry-run)! - logger.debug('generate_kernel: visit_integrals (dry run)') - with global_context(dry_run=True): + # Check that metadata for a given measure type agrees. This is a limitation + # of our current approach that is hard to overcome. + def remove_nonuser_metadata(d): + return frozendict({k: v for k, v in d.items() if k != "estimated_polynomial_degree"}) + + meta_dicts = [remove_nonuser_metadata(i.metadata()) for i in integrals] + if len(set(meta_dicts)) > 1: + measure = get_global_context_value("measure") + raise CodegenUFLError("Measure {} used with varying metadata! dune-codegen does not currently support this.".format(measure)) + + with form_option_context(**meta_dicts[0]): + # Visit all integrals once to collect information (dry-run)! + logger.debug('generate_kernel: visit_integrals (dry run)') + with global_context(dry_run=True): + for integral in integrals: + visit_integral(integral) + + # Now perform some checks on what should be done + from dune.codegen.sumfact.vectorization import decide_vectorization_strategy + logger.debug('generate_kernel: decide_vectorization_strategy') + decide_vectorization_strategy() + + # Delete the cache contents and do the real thing! + logger.debug('generate_kernel: visit_integrals (no dry run)') + from dune.codegen.generation import delete_cache_items + delete_cache_items("kernel_default") for integral in integrals: visit_integral(integral) - # Now perform some checks on what should be done - from dune.codegen.sumfact.vectorization import decide_vectorization_strategy - logger.debug('generate_kernel: decide_vectorization_strategy') - decide_vectorization_strategy() - - # Delete the cache contents and do the real thing!
- logger.debug('generate_kernel: visit_integrals (no dry run)') - from dune.codegen.generation import delete_cache_items - delete_cache_items("kernel_default") - for integral in integrals: - visit_integral(integral) - from dune.codegen.pdelab.signatures import kernel_name, assembly_routine_signature name = kernel_name() signature = assembly_routine_signature() @@ -1026,13 +1037,9 @@ def generate_jacobian_kernels(form, original_form): operator_kernels[(it, 'jacobian_apply')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)] if get_form_option("generate_jacobians"): with global_context(form_type="jacobian"): - if get_form_option("generate_jacobians"): - if get_form_option("sumfact"): - was_sumfact = True - if get_form_option("sumfact_regular_jacobians"): - old_geometry_mixins = get_form_option("geometry_mixins") - set_form_option("geometry_mixins", "generic") - set_form_option("sumfact", False) + with form_option_context(conditional=get_form_option("sumfact") and get_form_option("sumfact_regular_jacobians"), + geometry_mixins="generic", + sumfact=False): for measure in set(i.integral_type() for i in jacform.integrals()): if not measure_is_enabled(measure): continue @@ -1059,10 +1066,6 @@ def generate_jacobian_kernels(form, original_form): with global_context(integral_type=it): from dune.codegen.pdelab.signatures import assembly_routine_signature operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)] - if get_form_option("sumfact_regular_jacobians"): - if was_sumfact: - set_form_option("sumfact", True) - set_form_option("geometry_mixins", old_geometry_mixins) return operator_kernels diff --git a/python/dune/codegen/pdelab/quadrature.py b/python/dune/codegen/pdelab/quadrature.py index 9814f6f2d389e4ec46d055ff2892f8058a330ebf..ca44056ce82037124cc3ea2a9781e6eb1feb9692 100644 --- a/python/dune/codegen/pdelab/quadrature.py +++ b/python/dune/codegen/pdelab/quadrature.py @@ -203,7 +203,7 @@ def quadrature_order(): possible to use a different quadrature_order per direction. """ if get_form_option("quadrature_order"): - quadrature_order = tuple(map(int, get_form_option("quadrature_order").split(','))) + quadrature_order = tuple(map(int, str(get_form_option("quadrature_order")).split(','))) else: quadrature_order = _estimate_quadrature_order() diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index 1e801f4550d17d89ef8972fe3ffa867d4a44ea25..7908ab3515eb43c1f667216481821024076bcf88 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -20,7 +20,7 @@ from cgen import ArrayOf, AlignedAttribute, Initializer from dune.codegen.generation import cache_restoring, delete_cache_items from dune.codegen.loopy.target import DuneTarget, type_floatingpoint from dune.codegen.sumfact.realization import realize_sumfact_kernel_function -from dune.codegen.options import get_option, set_option +from dune.codegen.options import get_option, option_context from dune.codegen.error import CodegenAutotuneError @@ -185,296 +185,281 @@ def generate_standalone_code_google_benchmark(sf, filename): delete_cache_items("kernel_default") # Turn off opcounting - opcounting = get_option("opcounter") - set_option("opcounter", False) - - # Extract sum factorization kernel - from dune.codegen.pdelab.localoperator import extract_kernel_from_cache - knl = realize_sumfact_kernel_function(sf) - - # Add the implementation of the kernel. - # TODO: This can probably done in a safer way? 
- first_line = knl.member.lines[0] - arguments = first_line[first_line.find("(") + 1:first_line.find(")")] - - with open(filename, "w") as f: - f.writelines(["// {}".format(first_line), - "\n", - "#include \"config.h\"\n", - "#include \"benchmark/benchmark.h\"\n", - "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", - "#include<dune/codegen/common/vectorclass.hh>\n", - "#include<dune/codegen/sumfact/onedquadrature.hh>\n", - "#include<dune/codegen/sumfact/horizontaladd.hh>\n", - "#include<random>\n", - "#include<fstream>\n", - "#include<iostream>\n", - "\n" - ]) - - write_global_data(sf, filename) - - with open(filename, "a") as f: - arguments = ', '.join(sf.interface.signature_args) - if len(arguments) > 0: - arguments = ', ' + arguments - arguments = 'const char* buffer0, const char* buffer1' + arguments - f.write("void sumfact_kernel({})\n".format(arguments)) - for line in knl.member.lines[1:]: - f.write("{}\n".format(line)) - - f.write("\n\n") - f.write("static void BM_sumfact_kernel(benchmark::State& state){\n") - - write_setup_code(sf, filename, define_thetas=False) - - additional_arguments = [i.split()[-1] for i in sf.interface.signature_args] - additional_arguments = ', '.join(additional_arguments) - if len(additional_arguments) > 0: - additional_arguments = ', ' + additional_arguments - with open(filename, "a") as f: - f.writelines([" for (auto _ : state){\n", - " sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments), - " }\n", - "}\n", - "BENCHMARK(BM_sumfact_kernel);\n", - "\n", - "BENCHMARK_MAIN();" - ]) + with option_context(opcounter=False): + # Extract sum factorization kernel + from dune.codegen.pdelab.localoperator import extract_kernel_from_cache + knl = realize_sumfact_kernel_function(sf) - # Maybe turn opcounting on again - set_option("opcounter", opcounting) + # Add the implementation of the kernel. + # TODO: This can probably be done in a safer way?
+ first_line = knl.member.lines[0] + arguments = first_line[first_line.find("(") + 1:first_line.find(")")] + + with open(filename, "w") as f: + f.writelines(["// {}".format(first_line), + "\n", + "#include \"config.h\"\n", + "#include \"benchmark/benchmark.h\"\n", + "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", + "#include<dune/codegen/common/vectorclass.hh>\n", + "#include<dune/codegen/sumfact/onedquadrature.hh>\n", + "#include<dune/codegen/sumfact/horizontaladd.hh>\n", + "#include<random>\n", + "#include<fstream>\n", + "#include<iostream>\n", + "\n" + ]) + + write_global_data(sf, filename) + + with open(filename, "a") as f: + arguments = ', '.join(sf.interface.signature_args) + if len(arguments) > 0: + arguments = ', ' + arguments + arguments = 'const char* buffer0, const char* buffer1' + arguments + f.write("void sumfact_kernel({})\n".format(arguments)) + for line in knl.member.lines[1:]: + f.write("{}\n".format(line)) + + f.write("\n\n") + f.write("static void BM_sumfact_kernel(benchmark::State& state){\n") + + write_setup_code(sf, filename, define_thetas=False) + + additional_arguments = [i.split()[-1] for i in sf.interface.signature_args] + additional_arguments = ', '.join(additional_arguments) + if len(additional_arguments) > 0: + additional_arguments = ', ' + additional_arguments + with open(filename, "a") as f: + f.writelines([" for (auto _ : state){\n", + " sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments), + " }\n", + "}\n", + "BENCHMARK(BM_sumfact_kernel);\n", + "\n", + "BENCHMARK_MAIN();" + ]) def generate_standalone_code(sf, filename): delete_cache_items("kernel_default") # Turn off opcounting - opcounting = get_option("opcounter") - set_option("opcounter", False) - - # Extract sum factorization kernel - from dune.codegen.pdelab.localoperator import extract_kernel_from_cache - knl = realize_sumfact_kernel_function(sf) - first_line = knl.member.lines[0] - - with open(filename, "w") as f: - f.writelines(["// {}".format(first_line), - "\n", - "#include \"config.h\"\n", - "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", - "#include<dune/codegen/common/tsc.hh>\n", - "#include<dune/codegen/common/vectorclass.hh>\n", - "#include<dune/codegen/sumfact/onedquadrature.hh>\n", - "#include<dune/codegen/sumfact/horizontaladd.hh>\n", - "#include<random>\n", - "#include<fstream>\n", - "#include<iostream>\n", - "\n" - ]) - - f.writelines(["int main(int argc, char** argv)\n", - "{\n", - ]) - - write_setup_code(sf, filename) - - # Write measurement - with open(filename, "a") as f: - # Start a TSC timer - f.writelines([" auto start = Dune::PDELab::TSC::start();\n", - ]) - - # Add the implementation of the kernel. 
- repeats = int(1e9 / sf.operations) - f.write(" for(int i=0; i<{}; ++i)\n".format(repeats)) - f.write(" {\n") - for line in knl.member.lines[1:]: - f.write(" {}\n".format(line)) - f.write(" }\n") - - # Stop the TSC timer and write the result to a file - f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n", - " std::ofstream file;\n", - " file.open(argv[1]);\n", - " file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))), - " file.close();\n", - " accum += output[dis(rng)];\n", - " std::cout << accum;\n", - "}\n", - ]) - - # Maybe turn opcounting on again - set_option("opcounter", opcounting) + with option_context(opcounter=False): + # Extract sum factorization kernel + from dune.codegen.pdelab.localoperator import extract_kernel_from_cache + knl = realize_sumfact_kernel_function(sf) + first_line = knl.member.lines[0] + + with open(filename, "w") as f: + f.writelines(["// {}".format(first_line), + "\n", + "#include \"config.h\"\n", + "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", + "#include<dune/codegen/common/tsc.hh>\n", + "#include<dune/codegen/common/vectorclass.hh>\n", + "#include<dune/codegen/sumfact/onedquadrature.hh>\n", + "#include<dune/codegen/sumfact/horizontaladd.hh>\n", + "#include<random>\n", + "#include<fstream>\n", + "#include<iostream>\n", + "\n" + ]) + + f.writelines(["int main(int argc, char** argv)\n", + "{\n", + ]) + + write_setup_code(sf, filename) + + # Write measurement + with open(filename, "a") as f: + # Start a TSC timer + f.writelines([" auto start = Dune::PDELab::TSC::start();\n", + ]) + + # Add the implementation of the kernel. + repeats = int(1e9 / sf.operations) + f.write(" for(int i=0; i<{}; ++i)\n".format(repeats)) + f.write(" {\n") + for line in knl.member.lines[1:]: + f.write(" {}\n".format(line)) + f.write(" }\n") + + # Stop the TSC timer and write the result to a file + f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n", + " std::ofstream file;\n", + " file.open(argv[1]);\n", + " file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))), + " file.close();\n", + " accum += output[dis(rng)];\n", + " std::cout << accum;\n", + "}\n", + ]) def generate_standalone_kernel_code(kernel, signature, filename, transformations=None): # Turn off opcounting - opcounting = get_option("opcounter") - set_option("opcounter", False) - - # Remove opcounter from signature - p = re.compile('OpCounter::OpCounter<([^>]*)>') - assert len(signature) == 1 - sig = signature[0] - sig = p.sub(r'\1', sig) - assert 'OpCounter' not in signature - - # Which transformations were applied - codegen_transformations = '' - if transformations: + with option_context(opcounter=False): + # Remove opcounter from signature + p = re.compile('OpCounter::OpCounter<([^>]*)>') + assert len(signature) == 1 + sig = signature[0] + sig = p.sub(r'\1', sig) + assert 'OpCounter' not in sig + + # Which transformations were applied codegen_transformations = '' + if transformations: + codegen_transformations = '' + for trafo in transformations: + codegen_transformations += '// {}\n'.format(trafo) + + template = 'kernel_benchmark_template1.cc.in' + use_datasets = True + + # Old benchmark template + # template = 'kernel_benchmark_template0.cc.in' + # use_datasets = False + + template_filename = pkg_resources.resource_filename(__name__, template) + with open(template_filename, 'r') as f: + benchmark = f.read() + + # Find function arguments and global arguments + arguments = sig[sig.find('(') + 1:sig.find(')')].split(',') + arguments = [a.split(' ')[-1] for a in
arguments] - global_args = [a for a in kernel.args if a.name not in arguments] - buffer_arguments = [a for a in arguments if a.startswith('buff')] - input_arguments = [a for a in arguments if a not in buffer_arguments] - - # Declare global arguments - codegen_declare_global_arguments = '' - target = DuneTarget() - for g in global_args: - decl_info = g.decl_info(target, True, g.dtype) - for idi in decl_info: - ast_builder = target.get_device_ast_builder() - arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) - arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) - arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl) - codegen_declare_global_arguments += '{}\n'.format(arg_decl) - codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, ' ') - - # Helper function for argument initialization - def _initialize_arg(arg): - if isinstance(arg, lp.ValueArg): - return [] - real = type_floatingpoint() - size = reduce(mul, arg.shape) - fill_name = arg.name + '_fill' - lines = [' {}* {} = (double *) {};'.format(real, fill_name, arg.name), - ' for (std::size_t i=0; i<{}; ++i){{'.format(size), - ' {}[i] = unif(re);'.format(fill_name), - ' }'] - return lines - - # Initialize global arguments - codegen_initialize_global_arguments = '' - for arg in global_args: - lines = _initialize_arg(arg) - codegen_initialize_global_arguments += '\n'.join(lines) + '\n' - codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, ' ') - - codegen_initialize_input = '' - - # Function we want to benchmark - codegen_benchmark_function = '' - codegen_benchmark_function += sig[0:sig.find(')') + 1] - codegen_benchmark_function += lp.generate_body(kernel) - codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, ' ') - - # Declare function arguments - codegen_declare_arguments = [] - codegen_declare_input = [] - function_arguments = [a for a in kernel.args if a.name in arguments] - for arg in function_arguments: - if 'buffer' in arg.name: - byte_size = reduce(mul, arg.shape) * 8 - codegen_declare_arguments.append(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name, - byte_size, - arg.alignment),) - elif isinstance(arg, lp.ValueArg): - assert 'jacobian_offset' in arg.name - decl = arg.get_arg_decl(ast_builder) - decl = Initializer(decl, 'unif_int(re)') - codegen_declare_arguments.append((' {}\n'.format(decl))) - else: - assert 'fastdg' in arg.name - size = reduce(mul, arg.shape) - min_stride = min([tag.stride for tag in arg.dim_tags]) - size *= min_stride - alignment = arg.dtype.itemsize + if transformations: + codegen_transformations = '' + for trafo in transformations: + codegen_transformations += '// {}\n'.format(trafo) + + template = 'kernel_benchmark_template1.cc.in' + use_datasets = True + + # Old benchmark template + # template = 'kernel_benchmark_template0.cc.in' + # use_datasets = False + + template_filename = pkg_resources.resource_filename(__name__, template) + with open(template_filename, 'r') as f: + benchmark = f.read() + + # Find function arguments and global arguments + arguments = sig[sig.find('(') + 1:sig.find(')')].split(',') + arguments = [a.split(' ')[-1] for a in arguments] + global_args = [a for a in kernel.args if a.name not in arguments] + buffer_arguments = [a for a in arguments if a.startswith('buff')] + input_arguments = [a for a in arguments if a not in buffer_arguments] + + # Declare global arguments + codegen_declare_global_arguments = '' + target = DuneTarget() + for g in 
global_args: + decl_info = g.decl_info(target, True, g.dtype) + for idi in decl_info: + ast_builder = target.get_device_ast_builder() + arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) + arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) + arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl) + codegen_declare_global_arguments += '{}\n'.format(arg_decl) + codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, ' ') + + # Helper function for argument initialization + def _initialize_arg(arg): + if isinstance(arg, lp.ValueArg): + return [] real = type_floatingpoint() - if use_datasets: - codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real, - arg.name, - size, - alignment))) + size = reduce(mul, arg.shape) + fill_name = arg.name + '_fill' + lines = [' {}* {} = (double *) {};'.format(real, fill_name, arg.name), + ' for (std::size_t i=0; i<{}; ++i){{'.format(size), + ' {}[i] = unif(re);'.format(fill_name), + ' }'] + return lines + + # Initialize global arguments + codegen_initialize_global_arguments = '' + for arg in global_args: + lines = _initialize_arg(arg) + codegen_initialize_global_arguments += '\n'.join(lines) + '\n' + codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, ' ') + + codegen_initialize_input = '' + + # Function we want to benchmark + codegen_benchmark_function = '' + codegen_benchmark_function += sig[0:sig.find(')') + 1] + codegen_benchmark_function += lp.generate_body(kernel) + codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, ' ') + + # Declare function arguments + codegen_declare_arguments = [] + codegen_declare_input = [] + function_arguments = [a for a in kernel.args if a.name in arguments] + for arg in function_arguments: + if 'buffer' in arg.name: + byte_size = reduce(mul, arg.shape) * 8 + codegen_declare_arguments.append(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name, + byte_size, + arg.alignment),) + elif isinstance(arg, lp.ValueArg): + assert 'jacobian_offset' in arg.name + decl = arg.get_arg_decl(ast_builder) + decl = Initializer(decl, 'unif_int(re)') + codegen_declare_arguments.append((' {}\n'.format(decl))) else: - codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real, - arg.name, - size, - alignment))) - - codegen_declare_arguments = ''.join(codegen_declare_arguments) - codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, ' ') - codegen_declare_input = ''.join(codegen_declare_input) - codegen_declare_input = textwrap.indent(codegen_declare_input, ' ') - - # Initialize function arguments - codegen_initialize_arguments = '' - codegen_initialize_input = '' - for arg in function_arguments: - if 'fastdg' in arg.name: - if use_datasets: - lines = _initialize_arg(arg) - lines = [' ' + a for a in lines] - lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines] - lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){') - lines.append('}') - codegen_initialize_input += '\n'.join(lines) + '\n' + assert 'fastdg' in arg.name + size = reduce(mul, arg.shape) + min_stride = min([tag.stride for tag in arg.dim_tags]) + size *= min_stride + alignment = arg.dtype.itemsize + real = type_floatingpoint() + if use_datasets: + codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real, + arg.name, + size, + alignment))) + else: + codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned 
({})));\n'.format(real, + arg.name, + size, + alignment))) + + codegen_declare_arguments = ''.join(codegen_declare_arguments) + codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, ' ') + codegen_declare_input = ''.join(codegen_declare_input) + codegen_declare_input = textwrap.indent(codegen_declare_input, ' ') + + # Initialize function arguments + codegen_initialize_arguments = '' + codegen_initialize_input = '' + for arg in function_arguments: + if 'fastdg' in arg.name: + if use_datasets: + lines = _initialize_arg(arg) + lines = [' ' + a for a in lines] + lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines] + lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){') + lines.append('}') + codegen_initialize_input += '\n'.join(lines) + '\n' + else: + lines = _initialize_arg(arg) + codegen_initialize_arguments += '\n'.join(lines) + '\n' else: lines = _initialize_arg(arg) codegen_initialize_arguments += '\n'.join(lines) + '\n' + codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, ' ') + codegen_initialize_input = textwrap.indent(codegen_initialize_input, ' ') + + # Call the benchmark function + if use_datasets: + arguments_with_datasets = arguments.copy() + arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments] + codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n' + codegen_call_benchmark_function += ' ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n' + for arg in input_arguments: + codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg) + codegen_call_benchmark_function += '}' else: - lines = _initialize_arg(arg) - codegen_initialize_arguments += '\n'.join(lines) + '\n' - codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, ' ') - codegen_initialize_input = textwrap.indent(codegen_initialize_input, ' ') - - # Call the benchmark function - if use_datasets: - arguments_with_datasets = arguments.copy() - arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments] - codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n' - codegen_call_benchmark_function += ' ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n' - for arg in input_arguments: - codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg) - codegen_call_benchmark_function += '}' - else: - codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n' - codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, ' ') - - # Replace placeholders in benchmark template - benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations) - benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments) - benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input) - benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments) - benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input) - benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function) - benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments) - benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments) - benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', 
codegen_call_benchmark_function) - - # Write benchmark source file - with open(filename, 'w') as f: - f.writelines(benchmark) - - # Maybe turn opcounting on again - set_option("opcounter", opcounting) + codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n' + codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, ' ') + + # Replace placeholders in benchmark template + benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations) + benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments) + benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input) + benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function) + benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments) + benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function) + + # Write benchmark source file + with open(filename, 'w') as f: + f.writelines(benchmark) def autotune_realization(sf=None, kernel=None, signature=None, transformations=None): diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py index 811a201e9e11238bcdc4965ce2c7f75426fc3487..79ca2726bd3fab0dca0f7a21598ef13b9fe0dd0a 100644 --- a/python/dune/codegen/sumfact/geometry.py +++ b/python/dune/codegen/sumfact/geometry.py @@ -17,7 +17,7 @@ from dune.codegen.generation import (class_member, ) from dune.codegen.loopy.flatten import flatten_index from dune.codegen.loopy.target import type_floatingpoint -from dune.codegen.options import get_option +from dune.codegen.options import get_form_option, get_option from dune.codegen.pdelab.geometry import (enforce_boundary_restriction, local_dimension, world_dimension, @@ -41,7 +41,6 @@ from dune.codegen.sumfact.permutation import (permute_backward, from dune.codegen.sumfact.quadrature import additional_inames from dune.codegen.sumfact.symbolic import SumfactKernelInterfaceBase, SumfactKernel from dune.codegen.tools import get_pymbolic_basename, ImmutableCuttingRecord -from dune.codegen.options import get_form_option, option_switch from dune.codegen.ufl.modified_terminals import Restriction from loopy.match import Writes @@ -51,8 +50,13 @@ import numpy as np import loopy as lp +class SumfactGeometryMixinBase(GenericPDELabGeometryMixin): + def nonsumfact_fallback(self): + return None + + @geometry_mixin("sumfact_multilinear") -class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin): +class SumfactMultiLinearGeometryMixin(SumfactGeometryMixinBase): def nonsumfact_fallback(self): return "generic" @@ -242,7 +246,7 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin): @geometry_mixin("sumfact_axiparallel") -class SumfactAxiParallelGeometryMixin(AxiparallelGeometryMixin): +class SumfactAxiParallelGeometryMixin(SumfactGeometryMixinBase, AxiparallelGeometryMixin): def nonsumfact_fallback(self): return "axiparallel" diff --git a/python/dune/codegen/sumfact/switch.py b/python/dune/codegen/sumfact/switch.py index 65d35c17e4a8d2753e680c8c440fee6e9f4c6adf..6b8c43e4dac7bfd08f7bbbfb6a551b47fd02a499 100644 --- 
a/python/dune/codegen/sumfact/switch.py +++ b/python/dune/codegen/sumfact/switch.py @@ -12,7 +12,7 @@ from dune.codegen.pdelab.signatures import (assembly_routine_args, assembly_routine_signature, kernel_name, ) -from dune.codegen.options import get_form_option, get_option, set_form_option +from dune.codegen.options import get_form_option, get_option, form_option_context from dune.codegen.cgen.clazz import ClassMember @@ -26,21 +26,12 @@ def sumfact_generate_kernels_per_integral(integrals): if measure == "exterior_facet": # Maybe skip sum factorization on boundary integrals if not get_form_option("sumfact_on_boundary"): - set_form_option("sumfact", False) - - # Try to find a fallback for sum factorized geometry mixins - geometry_backup = get_form_option("geometry_mixins") - mixin = construct_from_mixins(mixins=[geometry_backup])() - if hasattr(mixin, "nonsumfact_fallback"): - set_form_option("geometry_mixins", mixin.nonsumfact_fallback()) - - for k in generate_kernels_per_integral(integrals): - yield k - - # Reset state - set_form_option("geometry_mixins", geometry_backup) - set_form_option("sumfact", True) - return + mixin = construct_from_mixins(mixins=[get_form_option("geometry_mixins")])() + geometry = mixin.nonsumfact_fallback() or get_form_option("geometry_mixins") + with form_option_context(sumfact=False, geometry_mixins=geometry): + for k in generate_kernels_per_integral(integrals): + yield k + return # Generate all necessary kernels for facedir in range(dim): diff --git a/python/dune/codegen/sumfact/vectorization.py b/python/dune/codegen/sumfact/vectorization.py index e753652b10b3ac9b1765ee071bce1ccebd15f6b1..8da146281d6d5323303d215f42117ff5f011da37 100644 --- a/python/dune/codegen/sumfact/vectorization.py +++ b/python/dune/codegen/sumfact/vectorization.py @@ -20,7 +20,7 @@ from dune.codegen.sumfact.tabulation import (quadrature_points_per_direction, set_quadrature_points, ) from dune.codegen.error import CodegenVectorizationError -from dune.codegen.options import get_form_option, get_option, set_form_option +from dune.codegen.options import get_form_option, get_option, form_option_context from dune.codegen.tools import add_to_frozendict, round_to_multiple, list_diff from pymbolic.mapper.flop_counter import FlopCounter @@ -331,17 +331,16 @@ def level1_optimal_vectorization_strategy(sumfacts, width): # If we are using the 'target' strategy, we might want to log some information. 
if get_form_option("vectorization_strategy") == "target": # Print the achieved cost and the target cost on the screen - set_form_option("vectorization_strategy", "model") - target = float(get_form_option("vectorization_target")) - qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target)) - cost = strategy_cost((qp, optimal_strategies[qp])) - - print("The target cost was: {}".format(target)) - print("The achieved cost was: {}".format(cost)) - optimum = level1_optimal_vectorization_strategy(sumfacts, width) - print("The optimal cost would be: {}".format(strategy_cost(optimum))) - set_form_option("vectorization_strategy", "target") - print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp])))) + with form_option_context(vectorization_strategy="model"): + target = float(get_form_option("vectorization_target")) + qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target)) + cost = strategy_cost((qp, optimal_strategies[qp])) + + print("The target cost was: {}".format(target)) + print("The achieved cost was: {}".format(cost)) + optimum = level1_optimal_vectorization_strategy(sumfacts, width) + print("The optimal cost would be: {}".format(strategy_cost(optimum))) + print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp])))) # Print the employed vectorization strategy into a file suffix = "" diff --git a/test/poisson/poisson_tensor.ufl b/test/poisson/poisson_tensor.ufl index b527d05258667dae629f608a1a630e5f11f947b8..3591d4d70e17f8811cf32bb759212c7c33e0bb24 100644 --- a/test/poisson/poisson_tensor.ufl +++ b/test/poisson/poisson_tensor.ufl @@ -12,6 +12,9 @@ V = FiniteElement("CG", cell, 1) u = TrialFunction(V) v = TestFunction(V) +# Test metadata setting of options +dx = dx(metadata={"quadrature_order": 27}) + r= (inner(A*grad(u), grad(v)) + c*u*v -f*v)*dx exact_solution = g is_dirichlet = 1
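Reviewer note: a minimal usage sketch of the two new context managers, assuming the get_option/set_option and get_form_option/set_form_option semantics from python/dune/codegen/options.py. The call sites inside the with-blocks are illustrative only, not part of the patch.

    from dune.codegen.options import (form_option_context,
                                      get_form_option,
                                      option_context,
                                      )

    # Temporarily disable opcounting; the previous value is restored when
    # the block exits, even if an exception is raised inside it.
    with option_context(opcounter=False):
        pass  # e.g. generate standalone benchmark code here

    # conditional=False turns the whole switch into a no-op. This replaces
    # the old pattern of manually backing up, setting and resetting options.
    with form_option_context(conditional=get_form_option("sumfact_regular_jacobians"),
                             geometry_mixins="generic",
                             sumfact=False):
        pass  # kernels generated here see the overridden form options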
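Reviewer note on the quadrature.py hunk: with dx = dx(metadata={"quadrature_order": 27}) in the .ufl file, the form option now arrives as a plain int, whereas ini-file input arrives as a comma-separated string; the added str() wrap normalizes both. A short sketch of that behavior (parse_quadrature_order is a hypothetical helper name, not from the patch):

    def parse_quadrature_order(value):
        # Accept both the int coming from UFL metadata and the
        # comma-separated per-direction string coming from an ini file.
        return tuple(map(int, str(value).split(',')))

    assert parse_quadrature_order(27) == (27,)
    assert parse_quadrature_order("2,3") == (2, 3)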