diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py index 03ce2cf678f57d10c6d76a81aaf89fa3fe79af98..c03ebc1f12a73243bd0ba204c81802633cfec631 100644 --- a/python/dune/codegen/blockstructured/vectorization.py +++ b/python/dune/codegen/blockstructured/vectorization.py @@ -186,11 +186,17 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): from loopy.symbolic import IdentityMapper class AliasIndexCollector(CombineMapper, IdentityMapper): + def __init__(self): + self.found_alias = False + def combine(self, values): return sum(values, tuple()) def map_constant(self, expr): - return tuple() + if self.found_alias: + return (expr,) + else: + return tuple() map_variable = map_constant map_function_symbol = map_constant @@ -198,7 +204,10 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): def map_subscript(self, expr): if expr.aggregate.name.endswith(alias_suffix): - return expr.aggregate, expr.index_tuple + self.found_alias = True + indices = self.combine((self.rec(index) for index in expr.index_tuple)) + self.found_alias = False + return expr.aggregate, indices else: return tuple() @@ -467,50 +476,48 @@ def vectorize_micro_elements(knl): orig_iname = vec_iname if vec_iname in knl.all_inames() and get_global_context_value('integral_type') == 'cell': vcl_size = get_vcl_type_size(np.float64) + knl = add_iname_array(knl, vec_iname) - has_tail = get_form_option('number_of_blocks') % vcl_size > 0 - vectorize_tail = True + def _do_vectorization(knl, vec_iname, vcl_size, level=0): + inner_iname = vec_iname + '_inner' + outer_iname = vec_iname + '_outer' - knl = add_iname_array(knl, vec_iname) + has_tail = get_form_option('number_of_blocks') % vcl_size > 0 + vectorize_tail = (vcl_size // 2) > 1 and \ + (get_form_option('number_of_blocks') % vcl_size) % (vcl_size // 2) == 0 - # manually add tail, since split_iname with slabs tries to vectorize the tail - if has_tail: - vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size - from loopy.kernel.tools import DomainChanger - domch = DomainChanger(knl, (vec_iname,)) - knl = knl.copy(domains=domch.get_domains_with( - BasicSet('{{ [{0}]: 0<={0}<{1} }}'.format(vec_iname, vectorizable_bound)))) + # manually add tail, since split_iname with slabs tries to vectorize the tail + if has_tail: + # fake suitable loop bound + vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(knl, (vec_iname,)) + knl = knl.copy(domains=domch.get_domains_with( + BasicSet('{{ [{0}]: 0<={0}<{1} }}'.format(vec_iname, vectorizable_bound)))) - knl = lp.split_iname(knl, vec_iname, vcl_size) - knl = realize_tail(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size) + knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname) - tail_iname = vec_iname + '_inner' + '_tail' - else: - knl = lp.split_iname(knl, vec_iname, vcl_size) + tail_iname = vec_iname + '_inner' + '_tail' + knl = realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size) + else: + knl = lp.split_iname(knl, vec_iname, vcl_size) - knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')]) + knl = lp.tag_inames(knl, [(inner_iname, 'vec')]) - array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] - knl = lp.split_array_axis(knl, array_alias, 0, vcl_size) + array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] + knl = lp.split_array_axis(knl, array_alias, level, vcl_size) - def _do_vectorization(knl, iname_inner, iname_outer, vcl_size, levels=0): - knl = add_vcl_iname_array(knl, orig_iname, iname_inner, vcl_size) + knl = add_vcl_iname_array(knl, orig_iname, inner_iname, vcl_size) knl = add_vcl_temporaries(knl, vcl_size) - knl = add_vcl_accum_insns(knl, iname_inner, iname_outer, vcl_size) - knl = add_vcl_access(knl, iname_inner, vcl_size, levels) - return knl + knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size) + knl = add_vcl_access(knl, inner_iname, vcl_size, level) - knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size) - if has_tail and vectorize_tail: - vcl_size = vcl_size // 2 - vec_iname = tail_iname + if has_tail and vectorize_tail: + from pudb import set_trace; set_trace() + knl = _do_vectorization(knl, tail_iname, vcl_size // 2, level + 1) - knl = lp.split_iname(knl, vec_iname, vcl_size) - knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')]) - - array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] - knl = lp.split_array_axis(knl, array_alias, 1, vcl_size) + return knl - knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size, levels=1) + knl = _do_vectorization(knl, orig_iname, vcl_size) return knl