diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index c8c085c178d7dc2224d1efd1d76efaacf9da5259..e541e71366371039157cb757201f710ffd0a19ca 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -43,6 +43,7 @@ from dune.perftool.generation.loopy import (barrier,
                                             kernel_cached,
                                             noop_instruction,
                                             silenced_warning,
+                                            subst_rule,
                                             temporary_variable,
                                             transform,
                                             valuearg,
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index a97df4744fd1e661554a6febb306a8f858589c31..df47d5e9fb1ac03d8ed6bde6ec1de203470f4d2f 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -172,8 +172,8 @@ def noop_instruction(**kwargs):
                    context_tags="kernel",
                    cache_key_generator=no_caching,
                    )
-def transform(trafo, *args):
-    return (trafo, args)
+def transform(trafo, *args, **kwargs):
+    return (trafo, args, kwargs)
 
 
 @generator_factory(item_tags=("instruction", "barrier"),
@@ -216,3 +216,8 @@ def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwar
     globalarg(name, **kwargs)
 
     return name
+
+
+@generator_factory(item_tags=("substrule",), context_tags="kernel")
+def subst_rule(name, args, expr):
+    return lp.SubstitutionRule(name, args, expr)
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 4873920b2901f812bbe06e01f233c0f9d3423630..c56f24e25f1e62fa116b8f753c0eb7de04a2d8db 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -539,19 +539,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
 
     # Apply the transformations that were gathered during tree traversals
     for trafo in transformations:
-        kernel = trafo[0](kernel, *trafo[1])
-
-    # Precompute all the substrules
-    for sr in kernel.substitutions:
-        tmpname = "precompute_{}".format(sr)
-        kernel = lp.precompute(kernel,
-                               sr,
-                               temporary_name=tmpname,
-                               )
-        # Vectorization strategies are actually very likely to eliminate the
-        # precomputation temporary. To avoid the temporary elimination warning
-        # we need to explicitly disable it.
-        kernel = kernel.copy(silenced_warnings=kernel.silenced_warnings + ["temp_to_write({})".format(tmpname)])
+        kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
     from dune.perftool.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index 4776f4fe88cfd26f5585a987cdf117386a1d4fdc..4b704c449a8ea31249faa70011c2fd8db1558746 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -194,7 +194,7 @@ def _realize_sum_factorization_kernel(sf):
         tag = "{}_{}".format(tag, "_".join(sf.within_inames))
 
     # Collect the key word arguments for the loopy instruction
-    insn_args = {"forced_iname_deps": frozenset([iname for iname in out_inames]).union(frozenset(sf.within_inames)),
+    insn_args = {"forced_iname_deps": frozenset([i for i in out_inames]).union(frozenset(sf.within_inames)),
                  "forced_iname_deps_is_final": True,
                  "depends_on": insn_dep,
                  "tags": frozenset({tag}),
@@ -205,6 +205,7 @@ def _realize_sum_factorization_kernel(sf):
         # In case of direct output we directly accumulate the result
         # of the Sumfactorization into some global data structure.
         if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3:
+            insn_args["forced_iname_deps"] = insn_args["forced_iname_deps"].union(frozenset({vec_iname[0].name}))
             insn_dep = sf.output.realize_direct(matprod, output_inames, out_shape, insn_args)
         else:
             # Issue the reduction instruction that implements the multiplication
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 0256d86dad0bdcbb29fa73a2ec40ccb4c82f7ead..4ee4960ec15ba6ece4287ae3c1cb26e2f7a8eb5e 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -2,6 +2,7 @@
 from dune.perftool.options import get_option
 
 from dune.perftool.generation import (get_counted_variable,
+                                      subst_rule,
                                       transform,
                                       )
 from dune.perftool.pdelab.geometry import local_dimension, world_dimension
@@ -128,6 +129,14 @@ class VectorSumfactKernelOutput(SumfactKernelOutputBase):
     def realize_direct(self, result, inames, shape, args):
         outputs = set(self.outputs)
 
+        # If multiple horizontal_add's are to be performed with 'result'
+        # we need to precompute the result!
+        if len(outputs) > 1:
+            substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
+            subst_rule(substname, (), result)
+            result = prim.Call(prim.Variable(substname), ())
+            transform(lp.precompute, substname, precompute_outer_inames=args["forced_iname_deps"])
+
         deps = frozenset()
         for o in outputs:
             hadd_result = self._add_hadd(o, result)
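A minimal usage sketch of the new subst_rule/transform pairing, not part of the patch, mirroring the VectorSumfactKernelOutput.realize_direct change above; the helper name precompute_shared_expression and its arguments are placeholders chosen for illustration:

    import loopy as lp
    import pymbolic.primitives as prim

    from dune.perftool.generation import subst_rule, transform


    def precompute_shared_expression(expr, inames, outer_inames):
        # Register a named loopy substitution rule for the shared expression ...
        substname = "subst_{}".format("_".join([i.name for i in inames]))
        subst_rule(substname, (), expr)
        # ... and queue a deferred lp.precompute transformation. It is applied
        # later in extract_kernel_from_cache via trafo[0](kernel, *trafo[1], **trafo[2]).
        transform(lp.precompute, substname, precompute_outer_inames=outer_inames)
        # From here on, the expression is referenced through a call to the rule.
        return prim.Call(prim.Variable(substname), ())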