diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py index 22942d662f27e0921a042adb8edb92e46a5a5ffa..cfdfc7945ec506327c81cc5dcddb0a9f480c8dd5 100644 --- a/python/dune/codegen/blockstructured/vectorization.py +++ b/python/dune/codegen/blockstructured/vectorization.py @@ -1,6 +1,7 @@ import loopy as lp import numpy as np import pymbolic.primitives as prim +from dune.codegen.blockstructured.tools import sub_element_inames from loopy.match import Tagged, Id, Writes, Reads, And, Or, Iname, All, Not from islpy import BasicSet @@ -468,13 +469,18 @@ def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_siz temporary_variables=dict(**knl.temporary_variables, **temporaries_to_duplicate)) common_inames = knl.all_inames() - for insn in insns_to_duplicate: + for insn in new_insns: common_inames = common_inames & (insn.within_inames | insn.reduction_inames()) + if get_form_option('vectorization_blockstructured_tail_ordering') == 'blocked': + # TODO need to be more clever to get the right inames + macro_inames = frozenset((iname + '_0' * level) for iname in sub_element_inames()) + common_inames = common_inames - macro_inames + additional_inames_to_duplicate = frozenset() - for insn in insns_to_duplicate: - additional_inames_to_duplicate = additional_inames_to_duplicate | ((insn.within_inames | - insn.reduction_inames()) - common_inames) + for insn in new_insns: + insn_inames = insn.within_inames | insn.reduction_inames() + additional_inames_to_duplicate = additional_inames_to_duplicate | (insn_inames - common_inames) knl = lp.duplicate_inames(knl, tuple(additional_inames_to_duplicate), Or(tuple((Id(insn.id) for insn in new_insns)))) diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py index bf80a72a94d7facee6bfb019863883bf3c287feb..00fc2e48941a2dd52dcdd3beec5a7ef49795d02a 100644 --- a/python/dune/codegen/options.py +++ b/python/dune/codegen/options.py @@ -101,7 +101,8 @@ class CodegenFormOptionsArray(ImmutableRecord): blockstructured = CodegenOption(default=False, helpstr="Use block structure") number_of_blocks = CodegenOption(default=1, helpstr="Number of sub blocks in one direction") vectorization_blockstructured = CodegenOption(default=False, helpstr="Vectorize block structuring") - vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'nunmber_of_blocks' is not divisible by vector length.") + vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'nunmber_of_blocks' is not divisible by vector length") + vectorization_blockstructured_tail_ordering = CodegenOption(default='consecutive', helpstr="Ordering of the tail w.r.t the vectorized loop. Possible values: consecutive|blocked") adjoint = CodegenOption(default=False, helpstr="Generate adjoint operator") control = CodegenOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable") objective_function = CodegenOption(default=None, helpstr="Name of form representing the objective function in UFL file")