diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index e60151028fe7857680728df5aeebe88f44a30c6f..9a00a87662f563b4f034a4cb3cd72996fea68af7 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -190,15 +190,15 @@ class DuneASTBuilder(CASTBuilder): size = [] for t in temps: if isinstance(t, DuneTemporaryVariable) and t.custom_base_storage == bs: - # TODO: Extract correct size - alignment.append(8) + #TODO Extract alignment from the temporaries after switching to loopy 2018.1 + alignment.append(get_option("max_vector_width") // 8) from pytools import product size.append(product(t.shape)) alignment = max(alignment) size = max(size) - decl = "char {}[{}] __attribute__ ((aligned({})));".format(bs, size * alignment, alignment) + decl = "char {}[{}] __attribute__ ((aligned({})));".format(bs, size * 8, alignment) ret.append(cgen.Line(decl)) if self.target.declare_temporaries: diff --git a/python/dune/perftool/loopy/transformations/vectorize_quad.py b/python/dune/perftool/loopy/transformations/vectorize_quad.py index 5022ebaf8d58875705777d5c107fae2e0bb34f15..e3f30cc2d8d4313dbb7e30674cc59a360675fc30 100644 --- a/python/dune/perftool/loopy/transformations/vectorize_quad.py +++ b/python/dune/perftool/loopy/transformations/vectorize_quad.py @@ -242,7 +242,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): for insn in insns: # Get a vector view of the lhs expression lhsname = get_pymbolic_basename(insn.assignee) - knl = add_vector_view(knl, lhsname, pad_to=vec_size) + knl = add_vector_view(knl, lhsname) lhsname = get_vector_view_name(lhsname) rotating = "gradvec" in insn.tags diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index 0ce8a59565da22d9e861ec55038b7617eb328bc0..d113f8151ae4ba5298ed68b335bafbb74ecc5807 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -32,6 +32,7 @@ from dune.perftool.sumfact.accumulation import sumfact_iname from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.vcl import ExplicitVCLCast +from pytools import product from ufl import MixedElement import loopy as lp @@ -78,17 +79,17 @@ def _realize_sum_factorization_kernel(sf): # Get all the necessary pieces for a function call funcname = name_kernel_implementation_function(sf) - #TODO calculate the size and alignment correctly - size = 10000 - alignment = 8 buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2)) # Make sure that the storage is allocated and has a certain minimum size # This is necessary to allocate buffers that will be passed to sumfact kernel # functions. Loopy has no knowledge of what happens with those... for buf in buffers: + # Determine the necessary size of the buffer. We assume that we do not + # underintegrate the form!!! + size = product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width temporary_variable("{}_dummy".format(buf), - shape=(10000,), + shape=(size,), custom_base_storage=buf, decl_method=lambda n, k, di: None, )