diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index 2193e619182f4090a0ad9d1306c9dd8d771cf4dc..bdee930a9dd9b68183282d3905131d317241af8e 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -154,14 +154,16 @@ class DuneASTBuilder(CASTBuilder): def get_c_expression_to_code_mapper(self): return DuneCExpressionToCodeMapper() - def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info): + def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): # If this is not a DuneTemporaryVariable, it was introduced by loopy # and it should be totally under loopys control: Call the base class implementation! if not (isinstance(temp_var, DuneTemporaryVariable) and temp_var.custom_declaration): - return CASTBuilder.get_temporary_decl(self, knl, schedule_index, temp_var, decl_info) + return CASTBuilder.get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info) - if temp_var.decl_method: - return cgen.Line(temp_var.decl_method(temp_var.name, temp_var.shape, temp_var.shape_impl)) + if temp_var.custom_declaration: + decl = temp_var.decl_method(temp_var.name, temp_var.shape, temp_var.shape_impl) + if decl: + return cgen.Line(decl) def add_vector_access(self, access_expr, index): # There is no generic way of implementing a vector access with VCL, as @@ -176,10 +178,33 @@ class DuneASTBuilder(CASTBuilder): return cgen.Line("BARRIER;") def get_temporary_decls(self, codegen_state, schedule_index): + temps = codegen_state.kernel.temporary_variables.values() + # Declare all the custom base storages + ret = [] + for bs in set(t.custom_base_storage for t in temps if isinstance(t, DuneTemporaryVariable)) - set({None}): + if bs in [a.name for a in codegen_state.kernel.args]: + continue + + # Find the alignment bytes + alignment = [] + size = [] + for t in temps: + if t.custom_base_storage == bs: + # TODO: Extract correct size + alignment.append(8) + from pytools import product + size.append(product(t.shape)) + + alignment = max(alignment) + size = max(size) + + decl = "char {}[{}] __attribute__ ((aligned({})));".format(bs, size * alignment, alignment) + ret.append(cgen.Line(decl)) + if self.target.declare_temporaries: - return CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index) + return ret + CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index) else: - return [] + return ret class DuneTarget(TargetBase): diff --git a/python/dune/perftool/loopy/temporary.py b/python/dune/perftool/loopy/temporary.py index d916f6b0312ac763d60829047a52056d088014bc..21e0e9d0117ad5872d298f3b0cab4fc88b4a69f0 100644 --- a/python/dune/perftool/loopy/temporary.py +++ b/python/dune/perftool/loopy/temporary.py @@ -5,6 +5,7 @@ from dune.perftool.error import PerftoolLoopyError from loopy import TemporaryVariable +import loopy as lp import numpy @@ -44,11 +45,20 @@ def default_declaration(name, shape=(), shape_impl=()): return '{} {}(0.0);'.format(t, name) +def custom_base_storage_temporary_declaration(storage, dtype): + def _decl(name, *a): + from dune.perftool.loopy.target import numpy_to_cpp_dtype + _type = numpy_to_cpp_dtype(lp.types.NumpyType(dtype).dtype.name) + return "{0} *{1} = ({0} *){2};".format(_type, name, storage) + + return _decl + + class DuneTemporaryVariable(TemporaryVariable): - allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method"] + allowed_extra_kwargs = TemporaryVariable.allowed_extra_kwargs + ["managed", "shape_impl", "decl_method", "custom_base_storage"] - def __init__(self, name, managed=False, shape_impl=None, decl_method=None, **kwargs): + def __init__(self, name, managed=False, shape_impl=None, decl_method=None, custom_base_storage=None, **kwargs): self.managed = managed self.decl_method = decl_method self.shape_impl = shape_impl @@ -59,6 +69,15 @@ class DuneTemporaryVariable(TemporaryVariable): from dune.perftool.loopy.target import dtype_floatingpoint kwargs.setdefault('dtype', dtype_floatingpoint()) + if custom_base_storage and self.decl_method is None: + assert shape_impl is None + self.decl_method = custom_base_storage_temporary_declaration(custom_base_storage, kwargs["dtype"]) + self.custom_declaration = self.decl_method is not None - TemporaryVariable.__init__(self, name, managed=self.managed, shape_impl=self.shape_impl, decl_method=self.decl_method, **kwargs) + TemporaryVariable.__init__(self, name, + managed=self.managed, + shape_impl=self.shape_impl, + decl_method=self.decl_method, + custom_base_storage=custom_base_storage, + **kwargs) diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 215dc42119a5a628ce1e2dc1f8dba58cb4ddba36..8881a510fefed4ed72fa49d2222de57e9c3cf7fb 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -426,13 +426,12 @@ def generate_accumulation_instruction(expr, visitor): vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset() - from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype - storage = name_buffer_storage(buffer, 0) + from dune.perftool.sumfact.realization import name_buffer_storage temp = "input_{}".format(buffer) temporary_variable(temp, shape=vsf.quadrature_shape, dim_tags=vsf.quadrature_dimtags, - decl_method=buffer_decl(storage, get_sumfact_dtype(sf)), + custom_base_storage=name_buffer_storage(buffer, 0), managed=True, ) diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 1ba10e14bd4b8d213fb9b6a28b40bb4570b26bdb..ee2f068f9eb04f09af174944ee81574acf8d7014 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -82,12 +82,11 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): coeff = pc(container, lfs, basisiname) # Get the input temporary! - from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype - storage = name_buffer_storage(sf.buffer, 0) + from dune.perftool.sumfact.realization import name_buffer_storage name = "input_{}".format(sf.buffer) temporary_variable(name, shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width), - decl_method=buffer_decl(storage, get_sumfact_dtype(sf)), + custom_base_storage=name_buffer_storage(sf.buffer, 0), managed=True, ) diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index b69264b285e67eebb6139c52e261b8e4b312050c..1d6c6219c0a8ca01a1156b0d2fd9a9a1b5ad599b 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -67,11 +67,6 @@ def alias_data_array(name, data): return "auto {} = {}.data();".format(name, data) -@preamble -def declare_buffer_storage(name, size, alignment): - return "char {}[{}] __attribute__ ((aligned({})));".format(name, size * alignment, alignment) - - def name_buffer_storage(buff, which): name = "{}_{}".format(buff, which) return name @@ -88,9 +83,15 @@ def _realize_sum_factorization_kernel(sf): alignment = 8 buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2)) - # Make sure that the storage is allocated + # Make sure that the storage is allocated and has a certain minimum size + # This is necessary to allocate buffers that will be passed to sumfact kernel + # functions. Loopy has no knowledge of what happens with those... for buf in buffers: - declare_buffer_storage(buf, size, alignment) + temporary_variable("{}_dummy".format(buf), + shape=(10000,), + custom_base_storage=buf, + decl_method=lambda *a: None, + ) # Realize the input if it is not direct if not sf.input.direct_input_is_possible: @@ -108,7 +109,7 @@ def _realize_sum_factorization_kernel(sf): temporary_variable(out, shape=sf.output_shape, dim_tags=sf.output_dimtags, - decl_method=buffer_decl(buffers[sf.length % 2], get_sumfact_dtype(sf)), + custom_base_storage=buffers[sf.length % 2], managed=True, ) silenced_warning("read_no_write({})".format(out)) @@ -125,24 +126,17 @@ def buffer_decl(buffer, dtype): return _buffer_decl -def get_sumfact_dtype(sf): - if sf.vectorized: - pass - else: - from dune.perftool.loopy.target import dtype_floatingpoint - from loopy.types import NumpyType - return NumpyType(dtype_floatingpoint()).dtype.name - - class BufferSwitcher(object): def __init__(self, buffers=("buffer0", "buffer1")): self.buffers = buffers self.current = 0 def get_temporary(self, name=None, **kwargs): + bs = self.buffers[self.current] + globalarg(bs) temporary_variable(name, managed=True, - decl_method=buffer_decl(self.buffers[self.current], kwargs["dtype"]), + custom_base_storage=self.buffers[self.current], **kwargs ) @@ -229,7 +223,6 @@ def realize_sumfact_kernel_function(sf): inp = buffer.get_temporary("buff_step{}_in".format(l), shape=inp_shape + vec_shape, dim_tags=ftags, - dtype=get_sumfact_dtype(sf), ) # The input temporary will only be read from, so we need to silence the loopy warning @@ -255,7 +248,6 @@ def realize_sumfact_kernel_function(sf): out = buffer.get_temporary("buff_step{}_out".format(l), shape=output_shape + vec_shape, dim_tags=ftags, - dtype=get_sumfact_dtype(sf), ) # Write the matrix-matrix multiplication expression