diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py deleted file mode 100644 index c0e2ab310ce4cf6ddc8c8ea403d2e5cad1c63365..0000000000000000000000000000000000000000 --- a/python/dune/perftool/loopy/buffer.py +++ /dev/null @@ -1,55 +0,0 @@ -from dune.perftool.error import PerftoolLoopyError -from dune.perftool.generation import (get_counted_variable, - kernel_cached, - temporary_variable, - ) - - -class FlipFlopBuffer(object): - def __init__(self, identifier): - self.identifier = identifier - - # Initialize the counter that switches between the base storages! - self._current = 0 - - # Generate the base storage names - self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in (0, 1)) - - def switch_base_storage(self): - self._current = (self._current + 1) % 2 - - def get_temporary(self, **kwargs): - assert("base_storage" not in kwargs) - assert("storage_shape" not in kwargs) - - # Select the base storage and increase counter - base = self.base_storage[self._current] - - # Construct a temporary name - name = kwargs.pop("name", None) - if name is None: - name = get_counted_variable(self.identifier) - - # Construct the temporary and return it - temporary_variable(name, - base_storage=base, - managed=True, - _base_storage_access_may_be_aliasing=True, - **kwargs - ) - - return name - - -@kernel_cached -def initialize_buffer(identifier): - assert isinstance(identifier, str) - return FlipFlopBuffer(identifier) - - -def get_buffer_temporary(identifier, **kwargs): - return initialize_buffer(identifier).get_temporary(**kwargs) - - -def switch_base_storage(identifier): - initialize_buffer(identifier).switch_base_storage() diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 6f109f4cc05c21b0962c3065b1820fea749863ab..f1b4b7e0fc331403041acab0a0914766b60c3ad9 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -473,7 +473,11 @@ def generate_kernel(integrals): delete_cache_items("kernel_default") for integral in integrals: visit_integral(integral) - knl = extract_kernel_from_cache("kernel_default") + + from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature + name = kernel_name() + signature = assembly_routine_signature() + knl = extract_kernel_from_cache("kernel_default", name, signature) delete_cache_items("kernel_default") # Reset the quadrature degree @@ -491,7 +495,7 @@ def generate_kernels_per_integral(integrals): yield generate_kernel(integrals) -def extract_kernel_from_cache(tag, wrap_in_cgen=True): +def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True): # Now extract regular loopy kernel components from dune.perftool.loopy.target import DuneTarget domains = [i for i in retrieve_cache_items("{} and domain".format(tag))] @@ -512,13 +516,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): check_dep_resolution=False, ) - # Find a name for the kernel - if wrap_in_cgen: - from dune.perftool.pdelab.signatures import kernel_name - name = kernel_name() - else: - name = "constructor_kernel" - # Create the kernel from loopy import make_kernel, preprocess_kernel kernel = make_kernel(domains, @@ -570,8 +567,9 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): if wrap_in_cgen: # Wrap the kernel in something which can generate code - from dune.perftool.pdelab.signatures import assembly_routine_signature - signature = assembly_routine_signature() + if signature is None: + from dune.perftool.pdelab.signatures import assembly_routine_signature + signature = assembly_routine_signature() kernel = LoopyKernelMethod(signature, kernel) return kernel @@ -673,7 +671,7 @@ def cgen_class_from_cache(tag, members=[]): tparams = [i for i in retrieve_cache_items('{} and template_param'.format(tag))] # Construct the constructor - constructor_knl = extract_kernel_from_cache(tag, wrap_in_cgen=False) + constructor_knl = extract_kernel_from_cache(tag, "constructor_kernel", None, wrap_in_cgen=False) from dune.perftool.loopy.target import DuneTarget constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False)) signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params)) @@ -1035,6 +1033,13 @@ def generate_localoperator_file(kernels, filename): for k in kernels.values(): operator_methods.extend(k) + # Generate all the realizations of sum factorization kernel objects needed in this operator + from dune.perftool.sumfact.realization import realize_sumfact_kernel_function + for sf, qp in retrieve_cache_items("kernelimpl"): + from dune.perftool.sumfact.tabulation import set_quadrature_points + set_quadrature_points(qp) + operator_methods.append(realize_sumfact_kernel_function(sf)) + if get_option('instrumentation_level') >= 3: include_file('dune/perftool/common/timer.hh', filetag='operatorfile') operator_methods.append(TimerMethod()) diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index e928b817eaf7b5f0eb9492f64880f554578c6fe0..215dc42119a5a628ce1e2dc1f8dba58cb4ddba36 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -23,7 +23,6 @@ from dune.perftool.options import (get_form_option, get_option, ) from dune.perftool.loopy.flatten import flatten_index -from dune.perftool.loopy.buffer import get_buffer_temporary from dune.perftool.sumfact.quadrature import nest_quadrature_loops from dune.perftool.pdelab.localoperator import determine_accumulation_space from dune.perftool.pdelab.restriction import restricted_name @@ -427,11 +426,15 @@ def generate_accumulation_instruction(expr, visitor): vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset() - temp = get_buffer_temporary(buffer, - shape=vsf.quadrature_shape, - dim_tags=vsf.quadrature_dimtags, - name="input_{}".format(buffer), - ) + from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype + storage = name_buffer_storage(buffer, 0) + temp = "input_{}".format(buffer) + temporary_variable(temp, + shape=vsf.quadrature_shape, + dim_tags=vsf.quadrature_dimtags, + decl_method=buffer_decl(storage, get_sumfact_dtype(sf)), + managed=True, + ) # Those input fields, that are padded need to be set to zero # in order to do a horizontal_add later on diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 7c2740e13f195678ff234c9dbd946002f710eb83..1ba10e14bd4b8d213fb9b6a28b40bb4570b26bdb 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -32,7 +32,6 @@ from dune.perftool.pdelab.argument import name_coefficientcontainer from dune.perftool.pdelab.geometry import (local_dimension, world_dimension, ) -from dune.perftool.loopy.buffer import initialize_buffer, get_buffer_temporary from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase from dune.perftool.options import get_form_option from dune.perftool.pdelab.driver import FEM_name_mangling @@ -83,10 +82,14 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): coeff = pc(container, lfs, basisiname) # Get the input temporary! - name = get_buffer_temporary(sf.buffer, - shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width), - name="input_{}".format(sf.buffer) - ) + from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype + storage = name_buffer_storage(sf.buffer, 0) + name = "input_{}".format(sf.buffer) + temporary_variable(name, + shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width), + decl_method=buffer_decl(storage, get_sumfact_dtype(sf)), + managed=True, + ) assignee = prim.Subscript(prim.Variable(name), (prim.Variable(basisiname),) + (index,)) diff --git a/python/dune/perftool/sumfact/geometry.py b/python/dune/perftool/sumfact/geometry.py index d0763db759836bcd49150c9a983dcd357a49006d..6afbe6238b6a7f7ce735dbbadb04b4ae791e6d23 100644 --- a/python/dune/perftool/sumfact/geometry.py +++ b/python/dune/perftool/sumfact/geometry.py @@ -12,7 +12,6 @@ from dune.perftool.generation import (backend, temporary_variable, globalarg, ) -from dune.perftool.loopy.buffer import get_buffer_temporary from dune.perftool.pdelab.geometry import (local_dimension, world_dimension, name_geometry, @@ -41,10 +40,14 @@ class GeoCornersInput(SumfactKernelInputBase, ImmutableRecord): ImmutableRecord.__init__(self, dir=dir) def realize(self, sf, index, insn_dep): - name = get_buffer_temporary(sf.buffer, - shape=(2 ** local_dimension(), sf.vector_width), - name="input_{}".format(sf.buffer) - ) + from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype + storage = name_buffer_storage(sf.buffer, 0) + name = name="input_{}".format(sf.buffer) + temporary_variable(name, + shape=(2 ** local_dimension(), sf.vector_width), + decl_method=buffer_decl(storage, get_sumfact_dtype(sf)), + managed=True, + ) ciname = corner_iname() geo = name_geometry() diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index d4aea877bdfd46157f3447baaf02869a2dfe6131..b69264b285e67eebb6139c52e261b8e4b312050c 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -3,20 +3,19 @@ The code that triggers the creation of the necessary code constructs to realize a sum factorization kernel """ from dune.perftool.generation import (barrier, + delete_cache_items, dump_accumulate_timer, generator_factory, get_global_context_value, globalarg, instruction, + kernel_cached, post_include, preamble, silenced_warning, temporary_variable, transform, ) -from dune.perftool.loopy.buffer import (get_buffer_temporary, - switch_base_storage, - ) from dune.perftool.pdelab.argument import pymbolic_coefficient from dune.perftool.pdelab.basis import shape_as_pymbolic from dune.perftool.pdelab.geometry import world_dimension @@ -40,6 +39,22 @@ import numpy as np import pymbolic.primitives as prim +necessary_kernel_implementations = generator_factory(item_tags=("kernelimpl",), no_deco=True) + + +@generator_factory(cache_key_generator=lambda s, qp: (s.function_key, qp)) +def _name_kernel_implementation_function(sf, qp): + name = "sfimpl_{}".format("_".join(str(m) for m in sf.matrix_sequence)) + necessary_kernel_implementations((sf, qp)) + return name + + +def name_kernel_implementation_function(sf): + from dune.perftool.sumfact.quadrature import quadrature_points_per_direction + qp = quadrature_points_per_direction() + return _name_kernel_implementation_function(sf, qp) + + def realize_sum_factorization_kernel(sf, **kwargs): if get_global_context_value("dry_run", False): return sf, sf.insn_dep @@ -52,32 +67,96 @@ def alias_data_array(name, data): return "auto {} = {}.data();".format(name, data) -@generator_factory(item_tags=("sumfactkernel",), - context_tags=("kernel",), - cache_key_generator=lambda s, **kw: s.cache_key) +@preamble +def declare_buffer_storage(name, size, alignment): + return "char {}[{}] __attribute__ ((aligned({})));".format(name, size * alignment, alignment) + + +def name_buffer_storage(buff, which): + name = "{}_{}".format(buff, which) + return name + + +@kernel_cached def _realize_sum_factorization_kernel(sf): insn_dep = sf.insn_dep - # Measure times and count operations in c++ code - if get_option("instrumentation_level") >= 4: - if sf.stage == 1: - setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) - insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer), - within_inames=frozenset(sf.within_inames), - depends_on=insn_dep)})) - - timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage) - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - dump_accumulate_timer(timer_name) - insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - within_inames=frozenset(sf.within_inames), - depends_on=insn_dep, - ), - })) + # Get all the necessary pieces for a function call + funcname = name_kernel_implementation_function(sf) + #TODO calculate the size and alignment correctly + size = 10000 + alignment = 8 + buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2)) + + # Make sure that the storage is allocated + for buf in buffers: + declare_buffer_storage(buf, size, alignment) + # Realize the input if it is not direct if not sf.input.direct_input_is_possible: insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep)) + # Call the function + code = "{}({}, {});".format(funcname, *buffers) + insn_dep = frozenset({instruction(code=code, + depends_on=insn_dep, + within_inames=frozenset(sf.within_inames)) + }) + + # Interpret the output as a temporary of correct shape + out = "{}_output".format(sf.buffer) + temporary_variable(out, + shape=sf.output_shape, + dim_tags=sf.output_dimtags, + decl_method=buffer_decl(buffers[sf.length % 2], get_sumfact_dtype(sf)), + managed=True, + ) + silenced_warning("read_no_write({})".format(out)) + + return lp.TaggedVariable(out, sf.tag), insn_dep + + +def buffer_decl(buffer, dtype): + def _buffer_decl(name, *a): + from dune.perftool.loopy.target import numpy_to_cpp_dtype + _type = numpy_to_cpp_dtype(dtype) + return "{0} *{1} = ({0} *){2};".format(_type, name, buffer) + + return _buffer_decl + + +def get_sumfact_dtype(sf): + if sf.vectorized: + pass + else: + from dune.perftool.loopy.target import dtype_floatingpoint + from loopy.types import NumpyType + return NumpyType(dtype_floatingpoint()).dtype.name + + +class BufferSwitcher(object): + def __init__(self, buffers=("buffer0", "buffer1")): + self.buffers = buffers + self.current = 0 + + def get_temporary(self, name=None, **kwargs): + temporary_variable(name, + managed=True, + decl_method=buffer_decl(self.buffers[self.current], kwargs["dtype"]), + **kwargs + ) + + return name + + def switch(self): + self.current = (self.current + 1) % 2 + + +def realize_sumfact_kernel_function(sf): + # Get a buffer switcher instance + buffer = BufferSwitcher() + insn_dep = frozenset() + # Prepare some dim_tags/shapes for later use ftags = ",".join(["f"] * sf.length) novec_ftags = ftags @@ -147,9 +226,11 @@ def _realize_sum_factorization_kernel(sf): # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. - inp = get_buffer_temporary(sf.buffer, + inp = buffer.get_temporary("buff_step{}_in".format(l), shape=inp_shape + vec_shape, - dim_tags=ftags) + dim_tags=ftags, + dtype=get_sumfact_dtype(sf), + ) # The input temporary will only be read from, so we need to silence the loopy warning silenced_warning('read_no_write({})'.format(inp)) @@ -157,7 +238,7 @@ def _realize_sum_factorization_kernel(sf): input_summand = prim.Subscript(prim.Variable(inp), input_inames + vec_iname) - switch_base_storage(sf.buffer) + buffer.switch() # Get a temporary that interprets the base storage of the output. # @@ -171,9 +252,11 @@ def _realize_sum_factorization_kernel(sf): output_shape = tuple(out_shape[1:]) + (out_shape[0],) if l == len(matrix_sequence) - 1: output_shape = permute_backward(output_shape, perm) - out = get_buffer_temporary(sf.buffer, + out = buffer.get_temporary("buff_step{}_out".format(l), shape=output_shape + vec_shape, - dim_tags=ftags) + dim_tags=ftags, + dtype=get_sumfact_dtype(sf), + ) # Write the matrix-matrix multiplication expression matprod = prim.Product((matrix.pymbolic((prim.Variable(out_inames[0]), k_expr) + vec_iname), @@ -217,22 +300,196 @@ def _realize_sum_factorization_kernel(sf): ) }) - # Measure times and count operations in c++ code - if get_option("instrumentation_level") >= 4: - stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=frozenset({lp.match.Tagged(tag)}), - within_inames=frozenset(sf.within_inames))}) - if sf.stage == 1: - qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - dump_accumulate_timer(timer_name) - frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name), - depends_on=stop_insn)}) - - out = get_buffer_temporary(sf.buffer, - shape=sf.output_shape, - dim_tags=sf.output_dimtags, - ) - silenced_warning('read_no_write({})'.format(out)) - - return lp.TaggedVariable(out, sf.tag), insn_dep + # Construct a loopy kernel object + name = name_kernel_implementation_function(sf) + from dune.perftool.pdelab.localoperator import extract_kernel_from_cache + signature = "void {}(const char* buffer0, const char* buffer1) const".format(name) + kernel = extract_kernel_from_cache("kernel_default", name, [signature]) + delete_cache_items("kernel_default") + return kernel + + +# @generator_factory(item_tags=("sumfactkernel",), +# context_tags=("kernel",), +# cache_key_generator=lambda s, **kw: s.cache_key) +# def old_realize_sum_factorization_kernel(sf): +# insn_dep = sf.insn_dep +# +# # Measure times and count operations in c++ code +# if get_option("instrumentation_level") >= 4: +# if sf.stage == 1: +# setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) +# insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer), +# within_inames=frozenset(sf.within_inames), +# depends_on=insn_dep)})) +# +# timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage) +# post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') +# dump_accumulate_timer(timer_name) +# insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), +# within_inames=frozenset(sf.within_inames), +# depends_on=insn_dep, +# ), +# })) +# +# if not sf.input.direct_input_is_possible: +# insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep)) +# +# # Prepare some dim_tags/shapes for later use +# ftags = ",".join(["f"] * sf.length) +# novec_ftags = ftags +# ctags = ",".join(["c"] * sf.length) +# vec_shape = () +# if sf.vectorized: +# ftags = ftags + ",vec" +# ctags = ctags + ",vec" +# vec_shape = (sf.vector_width,) +# +# # Decide in which order we want to process directions in the +# # sumfactorization. A clever ordering can lead to a reduced +# # complexity. This will e.g. happen at faces where we only have +# # one quadratue point m_l=1 if l is the normal direction of the +# # face. +# # +# # Rule of thumb: small m's early and large n's late. +# perm = sumfact_permutation_strategy(sf) +# +# # Permute matrix sequence +# matrix_sequence = permute_forward(sf.matrix_sequence, perm) +# +# # Product of all matrices +# for l, matrix in enumerate(matrix_sequence): +# # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication +# # and get inames that realize the product. +# inp_shape = (matrix.cols,) + tuple(mat.cols for mat in matrix_sequence[l + 1:]) + tuple(mat.rows for mat in matrix_sequence[:l]) +# out_shape = (matrix.rows,) + tuple(mat.cols for mat in matrix_sequence[l + 1:]) + tuple(mat.rows for mat in matrix_sequence[:l]) +# out_inames = tuple(sumfact_iname(length, "out_inames_" + str(k)) for k, length in enumerate(out_shape)) +# vec_iname = () +# if matrix.vectorized: +# iname = sumfact_iname(sf.vector_width, "vec") +# vec_iname = (prim.Variable(iname),) +# transform(lp.tag_inames, [(iname, "vec")]) +# +# # A trivial reduction is implemented as a product, otherwise we run into +# # a code generation corner case producing way too complicated code. This +# # could be fixed upstream, but the loopy code realizing reductions is not +# # trivial and the priority is kind of low. +# if matrix.cols != 1: +# k = sumfact_iname(matrix.cols, "red") +# k_expr = prim.Variable(k) +# else: +# k_expr = 0 +# +# # Setup the input of the sum factorization kernel. In the +# # first matrix multiplication this can be taken from +# # * an input temporary (default) +# # * a global data structure (if FastDGGridOperator is in use) +# # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator) +# input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) +# if l == 0 and sf.input.direct_input_is_possible: +# # See comment below +# input_inames = permute_backward(input_inames, perm) +# inp_shape = permute_backward(inp_shape, perm) +# +# input_summand = sf.input.realize_direct(inp_shape, input_inames) +# else: +# # If we did permute the order of a matrices above we also +# # permuted the order of out_inames. Unfortunately the +# # order of our input is from 0 to d-1. This means we need +# # to permute _back_ to get the right coefficients. +# if l == 0: +# inp_shape = permute_backward(inp_shape, perm) +# input_inames = permute_backward(input_inames, perm) +# +# # Get a temporary that interprets the base storage of the input +# # as a column-major matrix. In later iteration of the matrix loop +# # this reinterprets the output of the previous iteration. +# inp = get_buffer_temporary(sf.buffer, +# shape=inp_shape + vec_shape, +# dim_tags=ftags) +# +# # The input temporary will only be read from, so we need to silence the loopy warning +# silenced_warning('read_no_write({})'.format(inp)) +# +# input_summand = prim.Subscript(prim.Variable(inp), +# input_inames + vec_iname) +# +# switch_base_storage(sf.buffer) +# +# # Get a temporary that interprets the base storage of the output. +# # +# # Note: In this step the reordering of the fastest directions +# # is happening. The new direction (out_inames[0]) and the +# # corresponding shape (out_shape[0]) goes to the end (slowest +# # direction) and everything stays column major (ftags->fortran +# # style). +# # +# # If we are in the last step we reverse the permutation. +# output_shape = tuple(out_shape[1:]) + (out_shape[0],) +# if l == len(matrix_sequence) - 1: +# output_shape = permute_backward(output_shape, perm) +# out = get_buffer_temporary(sf.buffer, +# shape=output_shape + vec_shape, +# dim_tags=ftags) +# +# # Write the matrix-matrix multiplication expression +# matprod = prim.Product((matrix.pymbolic((prim.Variable(out_inames[0]), k_expr) + vec_iname), +# input_summand)) +# +# # ... which may be a reduction, if k>0 +# if matrix.cols != 1: +# matprod = lp.Reduction("sum", k, matprod) +# +# # Here we also move the new direction (out_inames[0]) to the +# # end and reverse permutation +# output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) +# if l == len(matrix_sequence) - 1: +# output_inames = permute_backward(output_inames, perm) +# +# tag = "sumfact_stage{}".format(sf.stage) +# if sf.stage == 3: +# tag = "{}_{}".format(tag, "_".join(sf.within_inames)) +# +# # Collect the key word arguments for the loopy instruction +# insn_args = {"forced_iname_deps": frozenset([i for i in out_inames]).union(frozenset(sf.within_inames)), +# "forced_iname_deps_is_final": True, +# "depends_on": insn_dep, +# "tags": frozenset({tag}), +# "predicates": sf.predicates, +# "groups": frozenset({sf.group_name}), +# } +# +# # In case of direct output we directly accumulate the result +# # of the Sumfactorization into some global data structure. +# if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3: +# if sf.vectorized: +# insn_args["forced_iname_deps"] = insn_args["forced_iname_deps"].union(frozenset({vec_iname[0].name})) +# insn_dep = sf.output.realize_direct(matprod, output_inames, out_shape, insn_args) +# else: +# # Issue the reduction instruction that implements the multiplication +# # at the same time store the instruction ID for the next instruction to depend on +# insn_dep = frozenset({instruction(assignee=prim.Subscript(prim.Variable(out), output_inames + vec_iname), +# expression=matprod, +# **insn_args +# ) +# }) +# +# # Measure times and count operations in c++ code +# if get_option("instrumentation_level") >= 4: +# stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), +# depends_on=frozenset({lp.match.Tagged(tag)}), +# within_inames=frozenset(sf.within_inames))}) +# if sf.stage == 1: +# qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' +# post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') +# dump_accumulate_timer(timer_name) +# frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name), +# depends_on=stop_insn)}) +# +# out = get_buffer_temporary(sf.buffer, +# shape=sf.output_shape, +# dim_tags=sf.output_dimtags, +# ) +# silenced_warning('read_no_write({})'.format(out)) +# +# return lp.TaggedVariable(out, sf.tag), insn_dep diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index db93c6cf0145007e13dc82bbb7ff817c0b54e060..6794296ea591bec064a250ba29fea81c3caf201a 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -278,6 +278,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): # Some cache key definitions # Watch out for the documentation to see which key is used unter what circumstances # + @property + def function_key(self): + """ Kernels sharing this key may use the same kernel implementation function """ + return self.matrix_sequence @property def parallel_key(self): diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index daeabf9005cbff9c5917185b5711ceb6cb4342a3..5309fa832c7b805d4dc7e3d31213a2a70c0c2087 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -18,7 +18,6 @@ from dune.perftool.generation import (class_member, transform, valuearg ) -from dune.perftool.loopy.buffer import get_buffer_temporary from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.vcl import ExplicitVCLCast, get_vcl_type_size from dune.perftool.pdelab.localoperator import (name_domain_field,