diff --git a/applications/knl/poisson_dg/knl_poisson_dg.mini b/applications/knl/poisson_dg/knl_poisson_dg.mini index 37c02c14602bcb3a5a386ce885776a1fd4f20594..ba2f345fd39edcca22fbbb9aded8e4bd3193157c 100644 --- a/applications/knl/poisson_dg/knl_poisson_dg.mini +++ b/applications/knl/poisson_dg/knl_poisson_dg.mini @@ -51,7 +51,6 @@ vectorization_vertical = 2 matrix_free = 1 generate_jacobians = 0 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval -assure_statement_ordering = 1 [formcompiler.ufl_variants] cell = hexahedron diff --git a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini index e3466ed40eeebec2f361c7da2367ff60b20152ea..4f3f5826f71fd9fc742efa4d88ae222e45759529 100644 --- a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini +++ b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini @@ -49,7 +49,6 @@ vectorization_strategy = explicit vectorization_horizontal = 4 vectorization_vertical = 2 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval -assure_statement_ordering = 1 matrix_free = 1 generate_jacobians = 0 diff --git a/applications/poisson_dg/poisson_dg.mini b/applications/poisson_dg/poisson_dg.mini index 6e9ccac0e0d27a765f54f094077bb32282d4fec4..7e65a4815f5fccc2dc6aae955bbdef559e9ebe92 100644 --- a/applications/poisson_dg/poisson_dg.mini +++ b/applications/poisson_dg/poisson_dg.mini @@ -46,7 +46,6 @@ sumfact = 1 vectorization_quadloop = 1 vectorization_strategy = explicit quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval -assure_statement_ordering = 1 matrix_free = 1 generate_jacobians = 0 diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini index 4e9c6f95700feddaf4b48e1a9147ba455378ad4e..4e7ac555b33fa5fcef568cca5b6dddf6dd3552e3 100644 --- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini +++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini @@ -46,7 +46,6 @@ sumfact = 1 vectorization_quadloop = 1 vectorization_strategy = explicit quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval -assure_statement_ordering = 1 matrix_free = 1 generate_jacobians = 0 diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini index 6b1a1e9ca0b3cf1fdd2bc79cb9bd00ae12ebbb0c..a80567f9b73473ae09e52c3a768177431435e849 100644 --- a/applications/stokes_dg/stokes_dg.mini +++ b/applications/stokes_dg/stokes_dg.mini @@ -48,7 +48,6 @@ vectorization_quadloop = 1 vectorization_strategy = model vectorization_allow_quadrature_changes = 1 quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval -assure_statement_ordering = 1 matrix_free = 1 generate_jacobians = 0 diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh index 4abe4fa9de8f80bac0465ccfa7724035ce586063..2c38492b19c08452f7607b084f855782bb3cd35f 100755 --- a/patches/apply_patches.sh +++ b/patches/apply_patches.sh @@ -2,6 +2,7 @@ pushd python/loopy git apply ../../patches/loopy/Current.patch +git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch popd pushd dune/perftool/vectorclass diff --git a/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch b/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch new file mode 100644 index 0000000000000000000000000000000000000000..436533b399471411d105addab125814de66ec4e5 --- /dev/null +++ b/patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch @@ -0,0 +1,33 @@ +From abac8a2068e0333a0f00c276519c24c5c16bedf4 Mon Sep 17 00:00:00 2001 +From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> +Date: Mon, 26 Mar 2018 11:13:42 +0200 +Subject: [PATCH] Disable a logging statement that breaks + +--- + loopy/kernel/tools.py | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py +index 15840180..cb877eb6 100644 +--- a/loopy/kernel/tools.py ++++ b/loopy/kernel/tools.py +@@ -197,11 +197,11 @@ def find_all_insn_inames(kernel): + assert isinstance(write_deps, frozenset), type(insn) + assert isinstance(iname_deps, frozenset), type(insn) + +- logger.debug("%s: find_all_insn_inames: %s (init): %s - " +- "read deps: %s - write deps: %s" % ( +- kernel.name, insn.id, ", ".join(sorted(iname_deps)), +- ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)), +- )) ++# logger.debug("%s: find_all_insn_inames: %s (init): %s - " ++# "read deps: %s - write deps: %s" % ( ++# kernel.name, insn.id, ", ".join(sorted(iname_deps)), ++# ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)), ++# )) + + insn_id_to_inames[insn.id] = iname_deps + insn_assignee_inames[insn.id] = write_deps & kernel.all_inames() +-- +2.11.0 + diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index df47d5e9fb1ac03d8ed6bde6ec1de203470f4d2f..a4d8292f5f88bc315980efb521bf7c2cf6a95153 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -140,10 +140,9 @@ def _insn_cache_key(code=None, expression=None, **kwargs): def instruction(code=None, expression=None, **kwargs): assert (code is not None) or (expression is not None) assert not ((code is not None) and (expression is not None)) - assert 'id' not in kwargs # Get an ID for this instruction - id = 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4)) + id = kwargs.pop("id", 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4))) # Now create the actual instruction if code: diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py deleted file mode 100644 index c0e2ab310ce4cf6ddc8c8ea403d2e5cad1c63365..0000000000000000000000000000000000000000 --- a/python/dune/perftool/loopy/buffer.py +++ /dev/null @@ -1,55 +0,0 @@ -from dune.perftool.error import PerftoolLoopyError -from dune.perftool.generation import (get_counted_variable, - kernel_cached, - temporary_variable, - ) - - -class FlipFlopBuffer(object): - def __init__(self, identifier): - self.identifier = identifier - - # Initialize the counter that switches between the base storages! - self._current = 0 - - # Generate the base storage names - self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in (0, 1)) - - def switch_base_storage(self): - self._current = (self._current + 1) % 2 - - def get_temporary(self, **kwargs): - assert("base_storage" not in kwargs) - assert("storage_shape" not in kwargs) - - # Select the base storage and increase counter - base = self.base_storage[self._current] - - # Construct a temporary name - name = kwargs.pop("name", None) - if name is None: - name = get_counted_variable(self.identifier) - - # Construct the temporary and return it - temporary_variable(name, - base_storage=base, - managed=True, - _base_storage_access_may_be_aliasing=True, - **kwargs - ) - - return name - - -@kernel_cached -def initialize_buffer(identifier): - assert isinstance(identifier, str) - return FlipFlopBuffer(identifier) - - -def get_buffer_temporary(identifier, **kwargs): - return initialize_buffer(identifier).get_temporary(**kwargs) - - -def switch_base_storage(identifier): - initialize_buffer(identifier).switch_base_storage() diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index 2193e619182f4090a0ad9d1306c9dd8d771cf4dc..b718c2c632fa2ec92cc907f6af1c0ad3d8464b9c 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -9,6 +9,7 @@ from dune.perftool.generation import (include_file, retrieve_cache_functions, ) from dune.perftool.options import get_option +from dune.perftool.tools import round_to_multiple from loopy.symbolic import Literal from loopy.target import (TargetBase, @@ -154,14 +155,16 @@ class DuneASTBuilder(CASTBuilder): def get_c_expression_to_code_mapper(self): return DuneCExpressionToCodeMapper() - def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info): + def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): # If this is not a DuneTemporaryVariable, it was introduced by loopy # and it should be totally under loopys control: Call the base class implementation! if not (isinstance(temp_var, DuneTemporaryVariable) and temp_var.custom_declaration): - return CASTBuilder.get_temporary_decl(self, knl, schedule_index, temp_var, decl_info) + return CASTBuilder.get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info) - if temp_var.decl_method: - return cgen.Line(temp_var.decl_method(temp_var.name, temp_var.shape, temp_var.shape_impl)) + if temp_var.custom_declaration: + decl = temp_var.decl_method(temp_var.name, codegen_state.kernel, decl_info) + if decl: + return cgen.Line(decl) def add_vector_access(self, access_expr, index): # There is no generic way of implementing a vector access with VCL, as @@ -176,10 +179,34 @@ class DuneASTBuilder(CASTBuilder): return cgen.Line("BARRIER;") def get_temporary_decls(self, codegen_state, schedule_index): + temps = codegen_state.kernel.temporary_variables.values() + # Declare all the custom base storages + ret = [] + for bs in set(t.custom_base_storage for t in temps if isinstance(t, DuneTemporaryVariable)) - set({None}): + if bs in [a.name for a in codegen_state.kernel.args]: + continue + + # Find the alignment bytes + alignment = [] + size = [] + for t in temps: + if isinstance(t, DuneTemporaryVariable) and t.custom_base_storage == bs: + # TODO Extract alignment from the temporaries after switching to loopy 2018.1 + alignment.append(get_option("max_vector_width") // 8) + from pytools import product + size.append(product(t.shape)) + + alignment = max(alignment) + size = max(size) + size = round_to_multiple(size, alignment) + + decl = "char {}[{}] __attribute__ ((aligned({})));".format(bs, size * 8, alignment) + ret.append(cgen.Line(decl)) + if self.target.declare_temporaries: - return CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index) + return ret + CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index) else: - return [] + return ret class DuneTarget(TargetBase): diff --git a/python/dune/perftool/loopy/temporary.py b/python/dune/perftool/loopy/temporary.py index d916f6b0312ac763d60829047a52056d088014bc..2bf78ce94c573ca0f1614fc89b72fee52fa16333 100644 --- a/python/dune/perftool/loopy/temporary.py +++ b/python/dune/perftool/loopy/temporary.py @@ -5,6 +5,7 @@ from dune.perftool.error import PerftoolLoopyError from loopy import TemporaryVariable +import loopy as lp import numpy @@ -27,7 +28,10 @@ def _temporary_type(shape_impl, shape, first=True): return "Dune::FieldMatrix<{}, {}, {}>".format(_type, shape[0], shape[1]) -def default_declaration(name, shape=(), shape_impl=()): +def default_declaration(name, kernel, decl_info): + shape = kernel.temporary_variables[name].shape + shape_impl = kernel.temporary_variables[name].shape_impl + # Determine the C++ type to use for this temporary. t = _temporary_type(shape_impl, shape) if len(shape_impl) == 0: @@ -44,11 +48,20 @@ def default_declaration(name, shape=(), shape_impl=()): return '{} {}(0.0);'.format(t, name) +def custom_base_storage_temporary_declaration(storage): + def _decl(name, kernel, decl_info): + dtype = kernel.temporary_variables[name].dtype + _type = kernel.target.dtype_to_typename(decl_info.dtype) + return "{0} *{1} = ({0} *){2};".format(_type, name, storage) + + return _decl + + class DuneTemporaryVariable(TemporaryVariable): - allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method"] + allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method", "custom_base_storage"] - def __init__(self, name, managed=False, shape_impl=None, decl_method=None, **kwargs): + def __init__(self, name, managed=False, shape_impl=None, decl_method=None, custom_base_storage=None, **kwargs): self.managed = managed self.decl_method = decl_method self.shape_impl = shape_impl @@ -59,6 +72,15 @@ class DuneTemporaryVariable(TemporaryVariable): from dune.perftool.loopy.target import dtype_floatingpoint kwargs.setdefault('dtype', dtype_floatingpoint()) + if custom_base_storage and self.decl_method is None: + assert shape_impl is None + self.decl_method = custom_base_storage_temporary_declaration(custom_base_storage) + self.custom_declaration = self.decl_method is not None - TemporaryVariable.__init__(self, name, managed=self.managed, shape_impl=self.shape_impl, decl_method=self.decl_method, **kwargs) + TemporaryVariable.__init__(self, name, + managed=self.managed, + shape_impl=self.shape_impl, + decl_method=self.decl_method, + custom_base_storage=custom_base_storage, + **kwargs) diff --git a/python/dune/perftool/loopy/transformations/disjointgroups.py b/python/dune/perftool/loopy/transformations/disjointgroups.py deleted file mode 100644 index 791a7b35ea26cf96af224cd5ee0623cc2d7f673a..0000000000000000000000000000000000000000 --- a/python/dune/perftool/loopy/transformations/disjointgroups.py +++ /dev/null @@ -1,13 +0,0 @@ -""" A helper transformation that makes all groups conflicting """ - -from dune.perftool.options import get_form_option - - -def make_groups_conflicting(knl): - # As this transformation introduces a performance bug that basically - # kills our CI, we only apply it if really needed - meaning in production. - if get_form_option("assure_statement_ordering"): - groups = frozenset().union(*tuple(i.groups for i in knl.instructions)) - return knl.copy(instructions=[i.copy(conflicts_with_groups=groups - i.groups) for i in knl.instructions]) - else: - return knl diff --git a/python/dune/perftool/loopy/transformations/vectorize_quad.py b/python/dune/perftool/loopy/transformations/vectorize_quad.py index fa5b03c204b4d77f628a7566897d33dae3e67a7c..e3f30cc2d8d4313dbb7e30674cc59a360675fc30 100644 --- a/python/dune/perftool/loopy/transformations/vectorize_quad.py +++ b/python/dune/perftool/loopy/transformations/vectorize_quad.py @@ -7,8 +7,7 @@ from dune.perftool.generation import (function_mangler, ) from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.vcl import get_vcl_type, get_vcl_type_size -from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_vector_view, - add_vector_view, +from dune.perftool.loopy.transformations.vectorview import (add_vector_view, get_vector_view_name, ) from dune.perftool.loopy.symbolic import substitute @@ -149,7 +148,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): knl = knl.copy(temporary_variables=tmps) # Introduce a vector view of the precomputation result - knl = add_vector_view(knl, prec_quantity, flatview=True) + knl = add_vector_view(knl, prec_quantity) # # Construct a flat loop for the given instructions @@ -196,7 +195,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): horizontal, vertical = tuple(int(i) for i in re.match("vecsumfac_h(.*)_v(.*)", tag).groups()) # 1. Rotating the input data - knl = add_vector_view(knl, quantity, flatview=True) + knl = add_vector_view(knl, quantity) if horizontal > 1: new_insns.append(lp.CallInstruction((), # assignees prim.Call(TransposeReg(vertical=vertical, horizontal=horizontal), @@ -219,7 +218,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): elif tag is not None and tag == 'sumfac': # Add a vector view to this quantity expr, = quantity_exprs - knl = add_vector_view(knl, quantity, flatview=True) + knl = add_vector_view(knl, quantity) replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), (vector_indices.get(1), prim.Variable(vec_iname)), ) @@ -243,7 +242,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): for insn in insns: # Get a vector view of the lhs expression lhsname = get_pymbolic_basename(insn.assignee) - knl = add_vector_view(knl, lhsname, pad_to=vec_size, flatview=True) + knl = add_vector_view(knl, lhsname) lhsname = get_vector_view_name(lhsname) rotating = "gradvec" in insn.tags diff --git a/python/dune/perftool/loopy/transformations/vectorview.py b/python/dune/perftool/loopy/transformations/vectorview.py index 9b74e3d8ca67cc28584def2ec0055b5504febfc4..3a812abf50289b58e14bd2f155d155cc7bc3155d 100644 --- a/python/dune/perftool/loopy/transformations/vectorview.py +++ b/python/dune/perftool/loopy/transformations/vectorview.py @@ -5,7 +5,9 @@ being a an array of SIMD vectors """ from dune.perftool.loopy.target import dtype_floatingpoint +from dune.perftool.loopy.temporary import DuneTemporaryVariable from dune.perftool.loopy.vcl import get_vcl_type_size +from dune.perftool.tools import round_to_multiple import loopy as lp import numpy as np @@ -17,86 +19,47 @@ def get_vector_view_name(tmpname): return tmpname + "_vec" -def add_vector_view(knl, tmpname, pad_to=None, flatview=False): - """ - Kernel transformation to add a vector view temporary - that interprets the same memory as another temporary - """ +def add_vector_view(knl, tmpname, pad_to=1): temporaries = knl.temporary_variables - assert tmpname in temporaries temp = temporaries[tmpname] - vecname = get_vector_view_name(tmpname) + vectemp = get_vector_view_name(tmpname) bsname = tmpname + "_base" + vecsize = get_vcl_type_size(temp.dtype) - if vecname in knl.temporary_variables: + # Enforce idempotency + if vectemp in temporaries: return knl - # Add base storage to the original temporary! - if not temp.base_storage: - temp = temp.copy(base_storage=bsname, - _base_storage_access_may_be_aliasing=True, - ) - temporaries[tmpname] = temp - else: - bsname = temp.base_storage - - # Determine the shape by dividing total size by vector size - # Also apply the padding we need for rotation - # TODO: *Only* apply this padding if really needed (a bit hard to figure out) - vecsize = get_vcl_type_size(temp.dtype) - if all(isinstance(s, int) for s in temp.shape): - size = pt.product(temp.shape) // vecsize - if size % vecsize != 0: - size = (size // vecsize + 1) * vecsize + # Modify the original temporary to use our custom base storage mechanism + if isinstance(temp, DuneTemporaryVariable): + if temp.custom_base_storage: + bsname = temp.custom_base_storage + else: + temp = temp.copy(custom_base_storage=bsname) + temporaries[tmpname] = temp else: - size = prim.FloorDiv(prim.Product(temp.shape), vecsize) - size = (size // vecsize + 1) * vecsize - - # Maybe do some padding. - if pad_to: - size = (size // pad_to + 1) * pad_to + temp = DuneTemporaryVariable(custom_base_storage=bsname, + managed=True, + **temp.get_copy_kwargs() + ) + temporaries[tmpname] = temp - # Some vectorview are intentionally flat! (e.g. the output buffers of - # sum factorization kernels - if flatview: - shape = (size, vecsize) - dim_tags = "c,vec" - else: - shape = temp.shape - # This works around a loopy weirdness (which might as well be a bug) - # TODO: investigate this! - if len(shape) == 1: - shape = (1, vecsize) - dim_tags = "c,vec" - else: - dim_tags = temp.dim_tags[:-1] + ("vec",) + size = round_to_multiple(pt.product(temp.shape), vecsize) // vecsize + size = round_to_multiple(size, pad_to) # Now add a vector view temporary - vecname = tmpname + "_vec" - temporaries[vecname] = lp.TemporaryVariable(vecname, - dim_tags=dim_tags, - shape=shape, - base_storage=bsname, - dtype=dtype_floatingpoint(), - scope=lp.temp_var_scope.PRIVATE, - _base_storage_access_may_be_aliasing=True, - ) - - # Avoid that any of these temporaries are eliminated - silenced = ['temp_to_write({})'.format(tmpname), - 'temp_to_write({})'.format(vecname), - 'read_no_write({})'.format(tmpname), - 'read_no_write({})'.format(vecname), + temporaries[vectemp] = DuneTemporaryVariable(vectemp, + dim_tags="c,vec", + shape=(size, vecsize), + custom_base_storage=bsname, + scope=lp.temp_var_scope.PRIVATE, + managed=True, + ) + + # Avoid that these temporaries are eliminated + silenced = ['temp_to_write({})'.format(vectemp), + 'read_no_write({})'.format(vectemp), ] return knl.copy(temporary_variables=temporaries, silenced_warnings=knl.silenced_warnings + silenced) - - -def add_temporary_with_vector_view(knl, name, *args, **kwargs): - temps = knl.temporary_variables - assert name not in temps - temps[name] = lp.TemporaryVariable(name, *args, **kwargs) - knl = knl.copy(temporary_variables=temps) - knl = add_vector_view(knl, name) - return knl diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 38a3be311037dedae17ab5db93393f6ea398fd29..229b989002dd417ddb67d451f6c0de11f2bfc124 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -86,7 +86,6 @@ class PerftoolFormOptionsArray(ImmutableRecord): vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers") vectorization_list_index = PerftoolOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).") simplify = PerftoolOption(default=False, helpstr="Whether to simplify expressions using sympy") - assure_statement_ordering = PerftoolOption(default=False, helpstr="Whether special care should be taken for a good statement ordering in sumfact kernels, runs into a loopy scheduler performance bug, but is necessary for production.") generate_jacobians = PerftoolOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.") generate_residuals = PerftoolOption(default=True, helpstr="Whether alpha_* methods should be generated.") unroll_dimension_loops = PerftoolOption(default=False, helpstr="whether loops over the geometric dimension should be unrolled") diff --git a/python/dune/perftool/pdelab/adjoint.py b/python/dune/perftool/pdelab/adjoint.py index 14d660e80f5c012676c1118943e1790115411af5..aef03bbbf08e45f2510bd90f164bdd70b071580c 100644 --- a/python/dune/perftool/pdelab/adjoint.py +++ b/python/dune/perftool/pdelab/adjoint.py @@ -145,7 +145,11 @@ def generate_kernel(forms): for i, form in enumerate(forms): for integral in form: visit_integral(integral, i, len(forms)) - knl = extract_kernel_from_cache("kernel_default") + + from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature + name = kernel_name() + signature = assembly_routine_signature() + knl = extract_kernel_from_cache("kernel_default", name, signature) delete_cache_items("kernel_default") # Reset the quadrature degree diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py index d4db8834090378d5f037bc037c11a91176a7ebf3..2c724f8e4a716c46475434d4f1949de6ba512632 100644 --- a/python/dune/perftool/pdelab/basis.py +++ b/python/dune/perftool/pdelab/basis.py @@ -86,7 +86,7 @@ def declare_cache_temporary(element, restriction, which): t_cache = type_localbasis_cache(element) lfs = name_leaf_lfs(element, restriction) - def decl(name, shape, shape_impl): + def decl(name, kernel, decl_info): return "typename {}::{}ReturnType {};".format(t_cache, which, name, diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py index c859c99d0ab964e298a55bf2c9854f170c507599..3877165c574ac8ef7b6d7197b56f97d071f38f42 100644 --- a/python/dune/perftool/pdelab/driver/__init__.py +++ b/python/dune/perftool/pdelab/driver/__init__.py @@ -298,10 +298,10 @@ def generate_driver(): add_section("constraints", "Set up constraints container...") add_section("gridoperator", "Set up grid grid operators...") add_section("vector", "Set up solution vectors...") + add_section("timings", "Maybe take performance measurements...") add_section("solver", "Set up (non)linear solvers...") add_section("vtk", "Do visualization...") add_section("instat", "Set up instationary stuff...") - add_section("timings", "Maybe take performance measurements...") add_section("printing", "Maybe print residuals and matrices to stdout...") add_section("error", "Maybe calculate errors for test results...") diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py index 8f3ef96ff9fd2babe838a2b3ceb7baeeb63d67f7..0648df399d809fd84d30624711dfc2fb02bbc631 100644 --- a/python/dune/perftool/pdelab/driver/solve.py +++ b/python/dune/perftool/pdelab/driver/solve.py @@ -54,7 +54,8 @@ def dune_solve(): print_matrix() if get_option('instrumentation_level') >= 2: - from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream + from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier + timestream = name_timing_stream() setup_timer() from dune.perftool.generation import post_include post_include("HP_DECLARE_TIMER(solve);", filetag="driver") @@ -68,7 +69,6 @@ def dune_solve(): if get_option('instrumentation_level') >= 3: from dune.perftool.pdelab.driver.gridoperator import name_localoperator lop_name = name_localoperator(form_ident) - timestream = name_timing_stream() solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) return solve diff --git a/python/dune/perftool/pdelab/function.py b/python/dune/perftool/pdelab/function.py index c10324003955d0bbe2bbbf53f3adc73c57fd10c4..c1dadef18fdcb8a60a88d06aff5fac186a80432d 100644 --- a/python/dune/perftool/pdelab/function.py +++ b/python/dune/perftool/pdelab/function.py @@ -19,7 +19,7 @@ def bind_gridfunction_to_element(gf, restriction): def declare_grid_function_range(gridfunction): - def _decl(name, *args): + def _decl(name, kernel, decl_info): return "typename decltype({})::Range {};".format(gridfunction, name) return _decl diff --git a/python/dune/perftool/pdelab/geometry.py b/python/dune/perftool/pdelab/geometry.py index c382faeea09e13208aa7155b120fb070789c80e6..d308917adbd3391d4b61832938b3a1c4f3355eb5 100644 --- a/python/dune/perftool/pdelab/geometry.py +++ b/python/dune/perftool/pdelab/geometry.py @@ -261,7 +261,7 @@ def evaluate_unit_outer_normal(name): @preamble -def declare_normal(name, shape, shape_impl): +def declare_normal(name, kernel, decl_info): ig = name_intersection_geometry_wrapper() return "auto {} = {}.centerUnitOuterNormal();".format(name, ig) @@ -300,7 +300,7 @@ def type_jacobian_inverse_transposed(restriction): @kernel_cached def define_jacobian_inverse_transposed_temporary(restriction): @preamble - def _define_jacobian_inverse_transposed_temporary(name, shape, shape_impl): + def _define_jacobian_inverse_transposed_temporary(name, kernel, decl_info): t = type_jacobian_inverse_transposed(restriction) return "{} {};".format(t, name, diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 6f109f4cc05c21b0962c3065b1820fea749863ab..549cd30cbf2fce97e3f071bf428f14bd4e447e31 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -473,7 +473,11 @@ def generate_kernel(integrals): delete_cache_items("kernel_default") for integral in integrals: visit_integral(integral) - knl = extract_kernel_from_cache("kernel_default") + + from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature + name = kernel_name() + signature = assembly_routine_signature() + knl = extract_kernel_from_cache("kernel_default", name, signature) delete_cache_items("kernel_default") # Reset the quadrature degree @@ -491,7 +495,7 @@ def generate_kernels_per_integral(integrals): yield generate_kernel(integrals) -def extract_kernel_from_cache(tag, wrap_in_cgen=True): +def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timings=True): # Now extract regular loopy kernel components from dune.perftool.loopy.target import DuneTarget domains = [i for i in retrieve_cache_items("{} and domain".format(tag))] @@ -512,13 +516,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): check_dep_resolution=False, ) - # Find a name for the kernel - if wrap_in_cgen: - from dune.perftool.pdelab.signatures import kernel_name - name = kernel_name() - else: - name = "constructor_kernel" - # Create the kernel from loopy import make_kernel, preprocess_kernel kernel = make_kernel(domains, @@ -534,9 +531,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): from loopy import make_reduction_inames_unique kernel = make_reduction_inames_unique(kernel) - from dune.perftool.loopy.transformations.disjointgroups import make_groups_conflicting - kernel = make_groups_conflicting(kernel) - # Apply the transformations that were gathered during tree traversals for trafo in transformations: kernel = trafo[0](kernel, *trafo[1], **trafo[2]) @@ -570,9 +564,10 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): if wrap_in_cgen: # Wrap the kernel in something which can generate code - from dune.perftool.pdelab.signatures import assembly_routine_signature - signature = assembly_routine_signature() - kernel = LoopyKernelMethod(signature, kernel) + if signature is None: + from dune.perftool.pdelab.signatures import assembly_routine_signature + signature = assembly_routine_signature() + kernel = LoopyKernelMethod(signature, kernel, add_timings=add_timings) return kernel @@ -673,7 +668,7 @@ def cgen_class_from_cache(tag, members=[]): tparams = [i for i in retrieve_cache_items('{} and template_param'.format(tag))] # Construct the constructor - constructor_knl = extract_kernel_from_cache(tag, wrap_in_cgen=False) + constructor_knl = extract_kernel_from_cache(tag, "constructor_kernel", None, wrap_in_cgen=False) from dune.perftool.loopy.target import DuneTarget constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False)) signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params)) @@ -1031,10 +1026,23 @@ def generate_localoperator_kernels(operator): def generate_localoperator_file(kernels, filename): + logger = logging.getLogger(__name__) + operator_methods = [] for k in kernels.values(): operator_methods.extend(k) + # Generate all the realizations of sum factorization kernel objects needed in this operator + sfkernels = [sf for sf in retrieve_cache_items("kernelimpl")] + if sfkernels: + logger.info("generate_localoperator_kernels: Create {} sumfact kernel realizations".format(len(sfkernels))) + + from dune.perftool.sumfact.realization import realize_sumfact_kernel_function + for sf, qp in sfkernels: + from dune.perftool.sumfact.tabulation import set_quadrature_points + set_quadrature_points(qp) + operator_methods.append(realize_sumfact_kernel_function(sf)) + if get_option('instrumentation_level') >= 3: include_file('dune/perftool/common/timer.hh', filetag='operatorfile') operator_methods.append(TimerMethod()) diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index e928b817eaf7b5f0eb9492f64880f554578c6fe0..3417adf998d43cf9b6720b60e208d9da6518542c 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -23,8 +23,9 @@ from dune.perftool.options import (get_form_option, get_option, ) from dune.perftool.loopy.flatten import flatten_index -from dune.perftool.loopy.buffer import get_buffer_temporary +from dune.perftool.loopy.target import type_floatingpoint from dune.perftool.sumfact.quadrature import nest_quadrature_loops +from dune.perftool.pdelab.driver import FEM_name_mangling from dune.perftool.pdelab.localoperator import determine_accumulation_space from dune.perftool.pdelab.restriction import restricted_name from dune.perftool.pdelab.signatures import assembler_routine_name @@ -36,13 +37,13 @@ from dune.perftool.sumfact.tabulation import (basis_functions_per_direction, from dune.perftool.sumfact.switch import (get_facedir, get_facemod, ) -from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelOutputBase +from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase from dune.perftool.ufl.modified_terminals import extract_modified_arguments from dune.perftool.tools import get_pymbolic_basename, get_leaf from dune.perftool.error import PerftoolError from dune.perftool.sumfact.quadrature import quadrature_inames -from pytools import ImmutableRecord +from pytools import ImmutableRecord, product import loopy as lp import numpy as np @@ -85,7 +86,7 @@ def accum_iname(element, bound, i): return sumfact_iname(bound, "accum{}".format(suffix)) -class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord): +class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord): def __init__(self, accumvar=None, restriction=None, @@ -107,6 +108,14 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord): def __repr__(self): return ImmutableRecord.__repr__(self) + @property + def stage(self): + return 3 + + @property + def direct_is_possible(self): + return get_form_option("fastdg") + @property def within_inames(self): if self.trial_element is None: @@ -167,52 +176,73 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord): forced_iname_deps=frozenset(inames + additional_inames + self.within_inames), forced_iname_deps_is_final=True, depends_on=insn_dep, - predicates=sf.predicates + predicates=sf.predicates, + tags=frozenset({"sumfact_stage3"}), ) return frozenset({dep}) - def realize_direct(self, result, inames, shape, args): - ft = get_global_context_value("form_type") - - if self.test_element_index is None: - direct_output = "{}_access".format(self.accumvar) - else: - direct_output = "{}_access_comp{}".format(self.accumvar, self.test_element_index) + def realize_direct(self, result, inames, shape, which=0, **args): + direct_output = "fastdg{}".format(which) ftags = ",".join(["f"] * len(shape)) - from dune.perftool.sumfact.realization import alias_data_array - if ft == 'residual' or ft == 'jacobian_apply': + if self.trial_element is None: globalarg(direct_output, shape=shape, dim_tags=ftags, offset=_dof_offset(self.test_element, self.test_element_index), ) - alias_data_array(direct_output, self.accumvar) lhs = prim.Subscript(prim.Variable(direct_output), inames) else: - assert ft == 'jacobian' - - direct_output = "{}x{}".format(direct_output, self.trial_element_index) rowsize = sum(tuple(s for s in _local_sizes(self.trial_element))) - element = get_leaf(self.trial_element, self.trial_element_index) - other_shape = tuple(element.degree() + 1 for e in shape) - from pytools import product manual_strides = tuple("stride:{}".format(rowsize * product(shape[:i])) for i in range(len(shape))) - dim_tags = "{},{}".format(ftags, ",".join(manual_strides)) + offset = "jacobian_offset{}".format(which) + valuearg(offset) globalarg(direct_output, - shape=other_shape + shape, - offset=rowsize * _dof_offset(self.test_element, self.test_element_index) + _dof_offset(self.trial_element, self.trial_element_index), - dim_tags=dim_tags, + shape=shape, + offset=prim.Variable(offset) + rowsize * _dof_offset(self.test_element, self.test_element_index) + _dof_offset(self.trial_element, self.trial_element_index), + dim_tags=manual_strides, ) - alias_data_array(direct_output, self.accumvar) - # TODO: It is at least questionnable, whether using the *order* of the inames in here - # for indexing is a good idea. Then again, it is hard to find an alternative. - _ansatz_inames = tuple(prim.Variable(i) for i in self.within_inames) - lhs = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + inames) + lhs = prim.Subscript(prim.Variable(direct_output), inames) result = prim.Sum((lhs, result)) - return frozenset({instruction(assignee=lhs, expression=result, **args)}) + return frozenset({instruction(assignee=lhs, + expression=result, + tags=frozenset({"sumfact_stage3"}), + **args)}) + + @property + def function_name_suffix(self): + if get_form_option("fastdg"): + suffix = "_fastdg1_{}comp{}".format(FEM_name_mangling(self.test_element), self.test_element_index) + if self.within_inames: + suffix = "{}x{}comp{}".format(suffix, FEM_name_mangling(self.trial_element), self.trial_element_index) + return suffix + else: + return "" + + @property + def function_args(self): + if get_form_option("fastdg"): + ret = ("{}.data()".format(self.accumvar),) + if get_form_option("fastdg") and self.within_inames: + element = get_leaf(self.trial_element, self.trial_element_index) + shape = tuple(element.degree() + 1 for e in range(element.cell().geometric_dimension())) + jacobian_index = flatten_index(tuple(prim.Variable(i) for i in self.within_inames), shape, order="f") + ret = ret + (str(jacobian_index),) + return ret + else: + return () + + @property + def signature_args(self): + if get_form_option('fastdg'): + ret = ("{}* fastdg0".format(type_floatingpoint()),) + if self.within_inames: + ret = ret + ("unsigned int jacobian_offset0",) + return ret + else: + return () def _local_sizes(element): @@ -411,9 +441,8 @@ def generate_accumulation_instruction(expr, visitor): ) sf = SumfactKernel(matrix_sequence=matrix_sequence, - stage=3, position_priority=priority, - output=output, + interface=output, predicates=predicates, ) @@ -427,11 +456,14 @@ def generate_accumulation_instruction(expr, visitor): vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset() - temp = get_buffer_temporary(buffer, - shape=vsf.quadrature_shape, - dim_tags=vsf.quadrature_dimtags, - name="input_{}".format(buffer), - ) + from dune.perftool.sumfact.realization import name_buffer_storage + temp = "input_{}".format(buffer) + temporary_variable(temp, + shape=vsf.quadrature_shape, + dim_tags=vsf.quadrature_dimtags, + custom_base_storage=name_buffer_storage(buffer, 0), + managed=True, + ) # Those input fields, that are padded need to be set to zero # in order to do a horizontal_add later on @@ -447,12 +479,19 @@ def generate_accumulation_instruction(expr, visitor): # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1) timer_dep = frozenset() if get_option("instrumentation_level") >= 4: - timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' + timer_name = "{}_kernel_stage1".format(assembler_routine_name()) + timer_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=frozenset({lp.match.Tagged("sumfact_stage1"), 'hptimerstart_{}'.format(timer_name)}), + id="hptimerstop_{}".format(timer_name) + )} + ) + timer_name = '{}_kernel_quadratureloop'.format(assembler_routine_name()) post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') dump_accumulate_timer(timer_name) - if(jacobian_inames): - timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - within_inames=frozenset(jacobian_inames))}) + timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + within_inames=frozenset(jacobian_inames), + id="hptimerstart_{}".format(timer_name), + depends_on=timer_dep)}) # Determine dependencies from loopy.match import Or, Writes @@ -476,9 +515,10 @@ def generate_accumulation_instruction(expr, visitor): insn_dep = frozenset({contrib_dep}) if get_option("instrumentation_level") >= 4: - insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=insn_dep, - within_inames=frozenset(jacobian_inames))}) + insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=insn_dep, + within_inames=frozenset(jacobian_inames), + id="hptimerstop_{}".format(timer_name))})) # Add a sum factorization kernel that implements the multiplication # with the test function (stage 3) @@ -486,4 +526,12 @@ def generate_accumulation_instruction(expr, visitor): result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep))) if not get_form_option("fastdg"): - vsf.output.realize(vsf, result, insn_dep) + insn_dep = vsf.interface.realize(vsf, result, insn_dep) + + if get_option("instrumentation_level") >= 4: + assert vsf.stage == 3 + timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), vsf.stage) + insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=frozenset({lp.match.Tagged("sumfact_stage3")}), + within_inames=frozenset(jacobian_inames), + id="hptimerstop_{}".format(timer_name))}) diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 7c2740e13f195678ff234c9dbd946002f710eb83..39bba49e67fe66a4541ac0ae633e6e403be32c52 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -17,6 +17,7 @@ from dune.perftool.generation import (backend, kernel_cached, temporary_variable, ) +from dune.perftool.loopy.target import type_floatingpoint from dune.perftool.sumfact.tabulation import (basis_functions_per_direction, construct_basis_matrix_sequence, BasisTabulationMatrix, @@ -32,8 +33,7 @@ from dune.perftool.pdelab.argument import name_coefficientcontainer from dune.perftool.pdelab.geometry import (local_dimension, world_dimension, ) -from dune.perftool.loopy.buffer import initialize_buffer, get_buffer_temporary -from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase +from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase from dune.perftool.options import get_form_option from dune.perftool.pdelab.driver import FEM_name_mangling from dune.perftool.pdelab.restriction import restricted_name @@ -51,7 +51,7 @@ from loopy.match import Writes import pymbolic.primitives as prim -class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): +class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord): def __init__(self, coeff_func=None, element=None, @@ -72,7 +72,11 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): return repr(self) @property - def direct_input_is_possible(self): + def stage(self): + return 1 + + @property + def direct_is_possible(self): return get_form_option("fastdg") def realize(self, sf, insn_dep, index=0): @@ -83,10 +87,13 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): coeff = pc(container, lfs, basisiname) # Get the input temporary! - name = get_buffer_temporary(sf.buffer, - shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width), - name="input_{}".format(sf.buffer) - ) + from dune.perftool.sumfact.realization import name_buffer_storage + name = "input_{}".format(sf.buffer) + temporary_variable(name, + shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width), + custom_base_storage=name_buffer_storage(sf.buffer, 0), + managed=True, + ) assignee = prim.Subscript(prim.Variable(name), (prim.Variable(basisiname),) + (index,)) @@ -98,24 +105,40 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): return insn_dep.union(frozenset({insn})) - def realize_direct(self, shape, inames): - arg = "{}_access{}".format(self.coeff_func(self.restriction), - "_comp{}".format(self.element_index) if self.element_index else "" - ) + def realize_direct(self, shape, inames, which=0): + arg = "fastdg{}".format(which) from dune.perftool.sumfact.accumulation import _dof_offset - from dune.perftool.sumfact.realization import alias_data_array globalarg(arg, shape=shape, dim_tags=",".join("f" * len(shape)), offset=_dof_offset(self.element, self.element_index), ) - func = self.coeff_func(self.restriction) - alias_data_array(arg, func) - return prim.Subscript(prim.Variable(arg), inames) + @property + def function_name_suffix(self): + if get_form_option("fastdg"): + return "_fastdg1_{}comp{}".format(FEM_name_mangling(self.element), self.element_index) + else: + return "" + + @property + def function_args(self): + if get_form_option("fastdg"): + func = self.coeff_func(self.restriction) + return ("{}.data()".format(func),) + else: + return () + + @property + def signature_args(self): + if get_form_option("fastdg"): + return ("const {}* fastdg0".format(type_floatingpoint()),) + else: + return () + def _basis_functions_per_direction(element): """Number of basis functions per direction """ @@ -162,7 +185,7 @@ def pymbolic_coefficient_gradient(element, restriction, index, coeff_func, visit # The sum factorization kernel object gathering all relevant information sf = SumfactKernel(matrix_sequence=matrix_sequence, position_priority=grad_index, - input=inp, + interface=inp, ) from dune.perftool.sumfact.vectorization import attach_vectorization_info @@ -203,7 +226,7 @@ def pymbolic_coefficient(element, restriction, index, coeff_func, visitor): ) sf = SumfactKernel(matrix_sequence=matrix_sequence, - input=inp, + interface=inp, position_priority=3, ) diff --git a/python/dune/perftool/sumfact/geometry.py b/python/dune/perftool/sumfact/geometry.py index d0763db759836bcd49150c9a983dcd357a49006d..7b78de412d2e2c5a893e2d1d1d7e32315bd6aafd 100644 --- a/python/dune/perftool/sumfact/geometry.py +++ b/python/dune/perftool/sumfact/geometry.py @@ -12,13 +12,12 @@ from dune.perftool.generation import (backend, temporary_variable, globalarg, ) -from dune.perftool.loopy.buffer import get_buffer_temporary from dune.perftool.pdelab.geometry import (local_dimension, world_dimension, name_geometry, ) from dune.perftool.sumfact.switch import get_facedir -from dune.perftool.sumfact.symbolic import SumfactKernelInputBase +from dune.perftool.sumfact.symbolic import SumfactKernelInterfaceBase from dune.perftool.sumfact.vectorization import attach_vectorization_info from dune.perftool.options import get_form_option, option_switch from dune.perftool.ufl.modified_terminals import Restriction @@ -36,15 +35,18 @@ def corner_iname(): return name -class GeoCornersInput(SumfactKernelInputBase, ImmutableRecord): +class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord): def __init__(self, dir): ImmutableRecord.__init__(self, dir=dir) def realize(self, sf, index, insn_dep): - name = get_buffer_temporary(sf.buffer, - shape=(2 ** local_dimension(), sf.vector_width), - name="input_{}".format(sf.buffer) - ) + from dune.perftool.sumfact.realization import name_buffer_storage + name = "input_{}".format(sf.buffer) + temporary_variable(name, + shape=(2 ** local_dimension(), sf.vector_width), + custom_base_storage=name_buffer_storage(sf.buffer, 0), + managed=True, + ) ciname = corner_iname() geo = name_geometry() diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index d4aea877bdfd46157f3447baaf02869a2dfe6131..55e141086584e65ddec50fb1f7397c979661c663 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -3,20 +3,20 @@ The code that triggers the creation of the necessary code constructs to realize a sum factorization kernel """ from dune.perftool.generation import (barrier, + delete_cache_items, dump_accumulate_timer, generator_factory, get_global_context_value, globalarg, instruction, + kernel_cached, post_include, preamble, silenced_warning, temporary_variable, transform, ) -from dune.perftool.loopy.buffer import (get_buffer_temporary, - switch_base_storage, - ) +from dune.perftool.loopy.flatten import flatten_index from dune.perftool.pdelab.argument import pymbolic_coefficient from dune.perftool.pdelab.basis import shape_as_pymbolic from dune.perftool.pdelab.geometry import world_dimension @@ -28,11 +28,17 @@ from dune.perftool.sumfact.permutation import (sumfact_permutation_strategy, permute_backward, permute_forward, ) +from dune.perftool.sumfact.quadrature import quadrature_points_per_direction +from dune.perftool.sumfact.symbolic import (SumfactKernel, + VectorizedSumfactKernel, + ) from dune.perftool.sumfact.vectorization import attach_vectorization_info from dune.perftool.sumfact.accumulation import sumfact_iname from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.vcl import ExplicitVCLCast +from dune.perftool.tools import get_leaf, remove_duplicates +from pytools import product from ufl import MixedElement import loopy as lp @@ -40,6 +46,11 @@ import numpy as np import pymbolic.primitives as prim +# Have a generator function store the necessary sum factorization kernel implementations +# This way then can easily be extracted at the end of the form visiting process +necessary_kernel_implementations = generator_factory(item_tags=("kernelimpl",), cache_key_generator=lambda a: a[0].function_name, no_deco=True) + + def realize_sum_factorization_kernel(sf, **kwargs): if get_global_context_value("dry_run", False): return sf, sf.insn_dep @@ -47,36 +58,114 @@ def realize_sum_factorization_kernel(sf, **kwargs): return _realize_sum_factorization_kernel(sf, **kwargs) -@preamble -def alias_data_array(name, data): - return "auto {} = {}.data();".format(name, data) +def name_buffer_storage(buff, which): + name = "{}_{}".format(buff, which) + return name -@generator_factory(item_tags=("sumfactkernel",), - context_tags=("kernel",), - cache_key_generator=lambda s, **kw: s.cache_key) +@kernel_cached def _realize_sum_factorization_kernel(sf): insn_dep = sf.insn_dep # Measure times and count operations in c++ code if get_option("instrumentation_level") >= 4: - if sf.stage == 1: - setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) - insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer), - within_inames=frozenset(sf.within_inames), - depends_on=insn_dep)})) + setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) + timer_dep = frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer), + id="hptimerstop_{}".format(setuptimer))}) + + timer_name = '{}_kernel_stage1'.format(assembler_routine_name()) + post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') + dump_accumulate_timer(timer_name) + timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + id="hptimerstart_{}".format(timer_name), + depends_on=timer_dep, + ), + })) - timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage) + timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), sf.stage) post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') dump_accumulate_timer(timer_name) - insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - within_inames=frozenset(sf.within_inames), - depends_on=insn_dep, - ), - })) + timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + id="hptimerstart_{}".format(timer_name), + within_inames=frozenset(sf.within_inames), + depends_on=timer_dep.union(insn_dep), + ), + })) + insn_dep = insn_dep.union(timer_dep) + + # Get all the necessary pieces for a function call + buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2)) + + # Make sure that the storage is allocated and has a certain minimum size + # This is necessary to allocate buffers that will be passed to sumfact kernel + # functions. Loopy has no knowledge of what happens with those... + for buf in buffers: + # Determine the necessary size of the buffer. We assume that we do not + # underintegrate the form!!! + size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width, + product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width) + temporary_variable("{}_dummy".format(buf), + shape=(size,), + custom_base_storage=buf, + decl_method=lambda n, k, di: None, + ) + + # Realize the input if it is not direct + if sf.stage == 1 and not sf.interface.direct_is_possible: + insn_dep = insn_dep.union(sf.interface.realize(sf, insn_dep)) + + # Trigger generation of the sum factorization kernel function + qp = quadrature_points_per_direction() + necessary_kernel_implementations((sf, qp)) + + # Call the function + code = "{}({});".format(sf.function_name, ", ".join(buffers + sf.interface.function_args)) + tag = "sumfact_stage{}".format(sf.stage) + insn_dep = frozenset({instruction(code=code, + depends_on=insn_dep, + within_inames=frozenset(sf.within_inames), + tags=frozenset({tag}), + predicates=sf.predicates, + ) + }) + + # Interpret the output as a temporary of correct shape + out = "{}_output".format(sf.buffer) + temporary_variable(out, + shape=sf.output_shape, + dim_tags=sf.output_dimtags, + custom_base_storage=buffers[sf.length % 2], + managed=True, + ) + silenced_warning("read_no_write({})".format(out)) + + return lp.TaggedVariable(out, sf.tag), insn_dep + + +class BufferSwitcher(object): + def __init__(self): + self.current = 0 - if not sf.input.direct_input_is_possible: - insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep)) + def get_temporary(self, name=None, **kwargs): + assert name + bs = "buffer{}".format(self.current) + globalarg(bs) + temporary_variable(name, + managed=True, + custom_base_storage=bs, + **kwargs + ) + + return name + + def switch(self): + self.current = (self.current + 1) % 2 + + +def realize_sumfact_kernel_function(sf): + # Get a buffer switcher instance + buffer = BufferSwitcher() + insn_dep = frozenset() # Prepare some dim_tags/shapes for later use ftags = ",".join(["f"] * sf.length) @@ -129,12 +218,12 @@ def _realize_sum_factorization_kernel(sf): # * a global data structure (if FastDGGridOperator is in use) # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator) input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) - if l == 0 and sf.input.direct_input_is_possible: + if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible: # See comment below input_inames = permute_backward(input_inames, perm) inp_shape = permute_backward(inp_shape, perm) - input_summand = sf.input.realize_direct(inp_shape, input_inames) + input_summand = sf.interface.realize_direct(inp_shape, input_inames) else: # If we did permute the order of a matrices above we also # permuted the order of out_inames. Unfortunately the @@ -147,9 +236,10 @@ def _realize_sum_factorization_kernel(sf): # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. - inp = get_buffer_temporary(sf.buffer, + inp = buffer.get_temporary("buff_step{}_in".format(l), shape=inp_shape + vec_shape, - dim_tags=ftags) + dim_tags=ftags, + ) # The input temporary will only be read from, so we need to silence the loopy warning silenced_warning('read_no_write({})'.format(inp)) @@ -157,7 +247,7 @@ def _realize_sum_factorization_kernel(sf): input_summand = prim.Subscript(prim.Variable(inp), input_inames + vec_iname) - switch_base_storage(sf.buffer) + buffer.switch() # Get a temporary that interprets the base storage of the output. # @@ -171,9 +261,10 @@ def _realize_sum_factorization_kernel(sf): output_shape = tuple(out_shape[1:]) + (out_shape[0],) if l == len(matrix_sequence) - 1: output_shape = permute_backward(output_shape, perm) - out = get_buffer_temporary(sf.buffer, + out = buffer.get_temporary("buff_step{}_out".format(l), shape=output_shape + vec_shape, - dim_tags=ftags) + dim_tags=ftags, + ) # Write the matrix-matrix multiplication expression matprod = prim.Product((matrix.pymbolic((prim.Variable(out_inames[0]), k_expr) + vec_iname), @@ -189,25 +280,15 @@ def _realize_sum_factorization_kernel(sf): if l == len(matrix_sequence) - 1: output_inames = permute_backward(output_inames, perm) - tag = "sumfact_stage{}".format(sf.stage) - if sf.stage == 3: - tag = "{}_{}".format(tag, "_".join(sf.within_inames)) - # Collect the key word arguments for the loopy instruction - insn_args = {"forced_iname_deps": frozenset([i for i in out_inames]).union(frozenset(sf.within_inames)), - "forced_iname_deps_is_final": True, - "depends_on": insn_dep, - "tags": frozenset({tag}), - "predicates": sf.predicates, - "groups": frozenset({sf.group_name}), - } + insn_args = {"depends_on": insn_dep} # In case of direct output we directly accumulate the result # of the Sumfactorization into some global data structure. if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3: if sf.vectorized: - insn_args["forced_iname_deps"] = insn_args["forced_iname_deps"].union(frozenset({vec_iname[0].name})) - insn_dep = sf.output.realize_direct(matprod, output_inames, out_shape, insn_args) + insn_args["forced_iname_deps"] = frozenset({vec_iname[0].name}) + insn_dep = sf.interface.realize_direct(matprod, output_inames, out_shape, **insn_args) else: # Issue the reduction instruction that implements the multiplication # at the same time store the instruction ID for the next instruction to depend on @@ -217,22 +298,10 @@ def _realize_sum_factorization_kernel(sf): ) }) - # Measure times and count operations in c++ code - if get_option("instrumentation_level") >= 4: - stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=frozenset({lp.match.Tagged(tag)}), - within_inames=frozenset(sf.within_inames))}) - if sf.stage == 1: - qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - dump_accumulate_timer(timer_name) - frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name), - depends_on=stop_insn)}) - - out = get_buffer_temporary(sf.buffer, - shape=sf.output_shape, - dim_tags=sf.output_dimtags, - ) - silenced_warning('read_no_write({})'.format(out)) - - return lp.TaggedVariable(out, sf.tag), insn_dep + # Construct a loopy kernel object + from dune.perftool.pdelab.localoperator import extract_kernel_from_cache + args = ("const char* buffer0", "const char* buffer1") + sf.interface.signature_args + signature = "void {}({}) const".format(sf.function_name, ", ".join(args)) + kernel = extract_kernel_from_cache("kernel_default", sf.function_name, [signature], add_timings=False) + delete_cache_items("kernel_default") + return kernel diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index db93c6cf0145007e13dc82bbb7ff817c0b54e060..fb283a0536318c1585a0963a33ca76105571cb84 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -1,6 +1,6 @@ """ A pymbolic node representing a sum factorization kernel """ -from dune.perftool.options import get_option +from dune.perftool.options import get_form_option, get_option from dune.perftool.generation import (get_counted_variable, subst_rule, transform, @@ -8,9 +8,9 @@ from dune.perftool.generation import (get_counted_variable, from dune.perftool.pdelab.geometry import local_dimension, world_dimension from dune.perftool.sumfact.quadrature import quadrature_inames from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray -from dune.perftool.loopy.target import dtype_floatingpoint +from dune.perftool.loopy.target import dtype_floatingpoint, type_floatingpoint from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad -from dune.perftool.tools import get_leaf, maybe_wrap_subscript +from dune.perftool.tools import get_leaf, maybe_wrap_subscript, remove_duplicates from pytools import ImmutableRecord, product @@ -22,58 +22,85 @@ import frozendict import inspect -class SumfactKernelInputBase(object): +class SumfactKernelInterfaceBase(object): + """ A base class for the input/output of a sum factorization kernel + In stage 1, this represents the input object, in stage 3 the output object. + """ + def realize(self, *a, **kw): + raise NotImplementedError + + def realize_direct(self, *a, **kw): + raise NotImplementedError + @property - def direct_input_is_possible(self): - return False + def within_inames(self): + return () - def realize(self, sf, dep, index=0): - return dep + @property + def direct_is_possible(self): + return False - def realize_direct(self, inames): + @property + def stage(self): raise NotImplementedError + @property + def function_args(self): + return () + + @property + def signature_args(self): + return () + + @property + def function_name_suffix(self): + return "" + def __repr__(self): - return "SumfactKernelInputBase()" + return "SumfactKernelInterfaceBase()" -class VectorSumfactKernelInput(SumfactKernelInputBase): - def __init__(self, inputs): - assert(isinstance(inputs, tuple)) - self.inputs = inputs +class VectorSumfactKernelInput(SumfactKernelInterfaceBase): + def __init__(self, interfaces): + assert(isinstance(interfaces, tuple)) + self.interfaces = interfaces def __repr__(self): - return "_".join(repr(i) for i in self.inputs) + return "_".join(repr(i) for i in self.interfaces) + + @property + def stage(self): + return 1 @property - def direct_input_is_possible(self): - return all(i.direct_input_is_possible for i in self.inputs) + def direct_is_possible(self): + return all(i.direct_is_possible for i in self.interfaces) def realize(self, sf, dep): - for i, inp in enumerate(self.inputs): + for i, inp in enumerate(self.interfaces): dep = dep.union(inp.realize(sf, dep, index=i)) return dep def realize_direct(self, shape, inames): # Check whether the input exhibits a favorable structure # (whether we can broadcast scalar values into SIMD registers) - total = set(self.inputs) - lower = set(self.inputs[:len(self.inputs) // 2]) - upper = set(self.inputs[len(self.inputs) // 2:]) + total = set(self.interfaces) + lower = set(self.interfaces[:len(self.interfaces) // 2]) + upper = set(self.interfaces[len(self.interfaces) // 2:]) if len(total) == 1: # All input coefficients use the exact same input coefficient. # We implement this by broadcasting it into a SIMD register return prim.Call(ExplicitVCLCast(dtype_floatingpoint()), - (self.inputs[0].realize_direct(shape, inames),) + (self.interfaces[0].realize_direct(shape, inames),) ) elif len(total) == 2 and len(lower) == 1 and len(upper) == 1: # The lower and the upper part of the SIMD register use # the same input coefficient, we combine the SIMD register # from two shorter SIMD types return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()), - (self.inputs[0].realize_direct(shape, inames), - self.inputs[len(self.inputs) // 2].realize_direct(shape, inames), + (self.interfaces[0].realize_direct(shape, inames), + self.interfaces[len(self.interfaces) // 2].realize_direct(shape, inames, which=1), ) ) else: @@ -81,33 +108,41 @@ class VectorSumfactKernelInput(SumfactKernelInputBase): # need to load scalars into the SIMD vector. raise NotImplementedError("SIMD loads from scalars not implemented!") + @property + def function_args(self): + return sum((i.function_args for i in remove_duplicates(self.interfaces)), ()) -class SumfactKernelOutputBase(object): @property - def within_inames(self): - return () + def signature_args(self): + if get_form_option("fastdg"): + return tuple("const {}* fastdg{}".format(type_floatingpoint(), i) for i, _ in enumerate(remove_duplicates(self.interfaces))) + else: + return () - def realize(self, sf, result, insn_dep): - return dep + @property + def function_name_suffix(self): + return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces)) - def realize_direct(self, result, inames, shape, args): - raise NotImplementedError - def __repr__(self): - return "SumfactKernelOutputBase()" +class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): + def __init__(self, interfaces): + self.interfaces = interfaces + def __repr__(self): + return "_".join(repr(o) for o in self.interfaces) -class VectorSumfactKernelOutput(SumfactKernelOutputBase): - def __init__(self, outputs): - self.outputs = outputs + @property + def stage(self): + return 3 - def __repr__(self): - return "_".join(repr(o) for o in self.outputs) + @property + def within_inames(self): + return self.interfaces[0].within_inames def _add_hadd(self, o, result): hadd_function = "horizontal_add" - if len(set(self.outputs)) > 1: - pos = self.outputs.index(o) + if len(set(self.interfaces)) > 1: + pos = self.interfaces.index(o) if pos == 0: hadd_function = "horizontal_add_lower" else: @@ -116,10 +151,10 @@ class VectorSumfactKernelOutput(SumfactKernelOutputBase): return prim.Call(prim.Variable(hadd_function), (result,)) def realize(self, sf, result, insn_dep): - outputs = set(self.outputs) + outputs = set(self.interfaces) - trial_element, = set(o.trial_element for o in self.outputs) - trial_element_index, = set(o.trial_element_index for o in self.outputs) + trial_element, = set(o.trial_element for o in self.interfaces) + trial_element_index, = set(o.trial_element_index for o in self.interfaces) from dune.perftool.sumfact.accumulation import accum_iname element = get_leaf(trial_element, trial_element_index) if trial_element is not None else None inames = tuple(accum_iname(element, mat.rows, i) @@ -134,8 +169,8 @@ class VectorSumfactKernelOutput(SumfactKernelOutputBase): return deps - def realize_direct(self, result, inames, shape, args): - outputs = set(self.outputs) + def realize_direct(self, result, inames, shape, **args): + outputs = set(self.interfaces) # If multiple horizontal_add's are to be performed with 'result' # we need to precompute the result! @@ -143,15 +178,39 @@ class VectorSumfactKernelOutput(SumfactKernelOutputBase): substname = "haddsubst_{}".format("_".join([i.name for i in inames])) subst_rule(substname, (), result) result = prim.Call(prim.Variable(substname), ()) - transform(lp.precompute, substname, precompute_outer_inames=args["forced_iname_deps"]) + transform(lp.precompute, substname) deps = frozenset() for o in outputs: hadd_result = self._add_hadd(o, result) - deps = deps.union(o.realize_direct(hadd_result, inames, shape, args)) + which = tuple(remove_duplicates(self.interfaces)).index(o) + deps = deps.union(o.realize_direct(hadd_result, inames, shape, which=which, **args)) return deps + @property + def function_args(self): + if get_form_option("fastdg"): + return sum((i.function_args for i in remove_duplicates(self.interfaces)), ()) + else: + return() + + @property + def signature_args(self): + if get_form_option("fastdg"): + def _get_pair(i): + ret = ("{}* fastdg{}".format(type_floatingpoint(), i),) + if self.within_inames: + ret = ret + ("unsigned int jacobian_offset{}".format(i),) + return ret + return sum((_get_pair(i) for i, _ in enumerate(remove_duplicates(self.interfaces))), ()) + else: + return () + + @property + def function_name_suffix(self): + return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces)) + class SumfactKernelBase(object): pass @@ -161,11 +220,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): def __init__(self, matrix_sequence=None, buffer=None, - stage=1, position_priority=None, insn_dep=frozenset(), - input=SumfactKernelInputBase(), - output=SumfactKernelOutputBase(), + interface=SumfactKernelInterfaceBase(), predicates=frozenset(), ): """Create a sum factorization kernel @@ -219,31 +276,18 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): for intermediate results. The memory is expected to be pre-initialized with the input or you have to provide direct_input (FastDGGridOperator). - stage: 1 or 3 position_priority: Will be used in the dry run to order kernels when doing vectorization e.g. (dx u,dy u,dz u, u). - restriction: Restriction for faces values. insn_dep: An instruction ID that the first issued instruction should depend upon. All following ones will depend on each other. - input: An SumfactKernelInputBase instance describing the input of the kernel - accumvar: The accumulation variable to accumulate into - trial_element: The leaf element of the trial function space. - Used to correctly nest stage 3 in the jacobian case. - test_element: The leaf element of the test function space - Used to compute offsets in the fastdg case. - test_element_index: the component of the test_element - trial_element_index: the component of the trial_element + interface: An SumfactKernelInterfaceBase instance describing the input + (stage 1) or output (stage 3) of the kernel """ # Assert the inputs! assert isinstance(matrix_sequence, tuple) assert all(isinstance(m, BasisTabulationMatrixBase) for m in matrix_sequence) - - assert stage in (1, 3) - - assert isinstance(input, SumfactKernelInputBase) - assert isinstance(output, SumfactKernelOutputBase) - + assert isinstance(interface, SumfactKernelInterfaceBase) assert isinstance(insn_dep, frozenset) # The following construction is a bit weird: Dict comprehensions do not have @@ -269,7 +313,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): def __str__(self): # Above stringifier just calls back into this return "SF{}:[{}]->[{}]".format(self.stage, - str(self.input), + str(self.interface), ", ".join(str(m) for m in self.matrix_sequence)) mapper_method = "map_sumfact_kernel" @@ -278,6 +322,11 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): # Some cache key definitions # Watch out for the documentation to see which key is used unter what circumstances # + @property + def function_name(self): + """ The name of the function that implements this kernel """ + return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence), + self.interface.function_name_suffix) @property def parallel_key(self): @@ -304,11 +353,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): work on the same input coefficient (stage 1) or accumulate into the same thing (stage 3) """ - return (repr(self.input), repr(self.output)) - - @property - def group_name(self): - return "sfgroup_{}_{}".format(self.input, self.output) + return repr(self.interface) # # Some convenience methods to extract information about the sum factorization kernel @@ -318,7 +363,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): if self.parallel_key != other.parallel_key: return self.parallel_key < other.parallel_key if self.inout_key != other.inout_key: - return self.input_key < other.input_key + return self.inout_key < other.inout_key if self.position_priority == other.position_priority: return repr(self) < repr(other) if self.position_priority is None: @@ -342,7 +387,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): @property def within_inames(self): - return self.output.within_inames + return self.interface.within_inames def vec_index(self, sf): """ Map an unvectorized sumfact kernel object to its position @@ -428,6 +473,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): def tag(self): return "sumfac" + @property + def stage(self): + return self.interface.stage + # # Define properties for conformity with the interface of VectorizedSumfactKernel # @@ -529,7 +578,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) def __str__(self): # Above stringifier just calls back into this return "VSF{}:[{}]->[{}]".format(self.stage, - ", ".join(str(k.input) for k in self.kernels), + ", ".join(str(k.interface) for k in self.kernels), ", ".join(str(mat) for mat in self.matrix_sequence)) mapper_method = "map_vectorized_sumfact_kernel" @@ -540,6 +589,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) # Some cache key definitions # Watch out for the documentation to see which key is used unter what circumstances # + @property + def function_name(self): + return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence), + self.interface.function_name_suffix) @property def cache_key(self): @@ -596,12 +649,15 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) # @property - def input(self): - return VectorSumfactKernelInput(tuple(k.input for k in self.kernels)) + def stage(self): + return self.kernels[0].stage @property - def output(self): - return VectorSumfactKernelOutput(tuple(k.output for k in self.kernels)) + def interface(self): + if self.stage == 1: + return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels)) + else: + return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels)) @property def cache_key(self): @@ -611,10 +667,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) def inout_key(self): return tuple(k.inout_key for k in self.kernels) - @property - def group_name(self): - return "_".join(k.group_name for k in self.kernels) - @property def length(self): return self.kernels[0].length diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index daeabf9005cbff9c5917185b5711ceb6cb4342a3..5309fa832c7b805d4dc7e3d31213a2a70c0c2087 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -18,7 +18,6 @@ from dune.perftool.generation import (class_member, transform, valuearg ) -from dune.perftool.loopy.buffer import get_buffer_temporary from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.vcl import ExplicitVCLCast, get_vcl_type_size from dune.perftool.pdelab.localoperator import (name_domain_field, diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py index b29ebe221387a5af529f0dec29592b4de8dc762a..4a95dbced5e24d7efd0ddadc422511a1fc1362cc 100644 --- a/python/dune/perftool/tools.py +++ b/python/dune/perftool/tools.py @@ -93,3 +93,12 @@ def get_leaf(element, index): leaf_element = element.extract_component(index)[1] return leaf_element + + +def remove_duplicates(iterable): + """ Remove duplicates from an iterable while preserving the order """ + seen = set() + for i in iterable: + if i not in seen: + yield i + seen.add(i)