diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 46a639fb9c5b269db31ab10721f8630f74ca6d97..86a9b354efa1d1b6bc07d90edc45b8c531ff577c 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -108,6 +108,7 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor): preferred_position=i, insn_dep=insn_dep, restriction=restriction, + outshape=tuple(mat.rows for mat in a_matrices if mat.face is None), direct_input=direct_input, ) buffers.append(var) @@ -174,7 +175,7 @@ def pymbolic_trialfunction(element, restriction, component, visitor): 1, preferred_position=None, insn_dep=frozenset({Writes(inp)}), - outshape=tuple(mat.rows for mat in a_matrices if mat.rows != 1), + outshape=tuple(mat.rows for mat in a_matrices if mat.face is None), restriction=restriction, direct_input=direct_input, ) diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index e135fb75a1a8650a088852f95d67d1da03a4b77e..2429da4ef3d688ebe651f7d4de140ca76a8a779a 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -144,7 +144,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction)) # Initialize a base storage for this buffer and get a temporay pointing to it - shape = tuple(mat.cols for mat in a_matrices if mat.cols != 1) + shape = tuple(mat.cols for mat in a_matrices if mat.face is None) dim_tags = ",".join(['f'] * local_dimension()) if index is not None: shape = shape + (4,) @@ -434,7 +434,9 @@ def sum_factorization_kernel(a_matrices, }) if outshape is None: - outshape = tuple(mat.rows for mat in a_matrices if mat.rows != 1) + assert stage == 3 + outshape = tuple(mat.rows for mat in a_matrices) + dim_tags = ",".join(['f'] * len(outshape)) if next(iter(a_matrices)).vectorized: