diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py index c03ebc1f12a73243bd0ba204c81802633cfec631..5accb340eeba612f95e45152d0c7a70679c685bd 100644 --- a/python/dune/codegen/blockstructured/vectorization.py +++ b/python/dune/codegen/blockstructured/vectorization.py @@ -423,14 +423,14 @@ def add_vcl_iname_array(knl, iname, vec_iname, vcl_size): return knl -def realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size): +def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_size): tail_size = get_form_option('number_of_blocks') % vcl_size new_dom = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(tail_iname, tail_size)) insns_to_duplicate = lp.find_instructions(knl, Iname(inner_iname)) ids_to_duplicate = tuple((insn.id for insn in insns_to_duplicate)) - subst_map = dict([(outer_iname, get_form_option('number_of_blocks') // vcl_size), + subst_map = dict([(outer_iname, outer_bound // vcl_size), (inner_iname, prim.Variable(tail_iname))]) temporaries_to_duplicate = dict() @@ -478,18 +478,17 @@ def vectorize_micro_elements(knl): vcl_size = get_vcl_type_size(np.float64) knl = add_iname_array(knl, vec_iname) - def _do_vectorization(knl, vec_iname, vcl_size, level=0): + def _do_vectorization(knl, vec_iname, iname_bound, vcl_size, level=0): inner_iname = vec_iname + '_inner' outer_iname = vec_iname + '_outer' - has_tail = get_form_option('number_of_blocks') % vcl_size > 0 - vectorize_tail = (vcl_size // 2) > 1 and \ - (get_form_option('number_of_blocks') % vcl_size) % (vcl_size // 2) == 0 + tail_size = iname_bound % vcl_size + vectorize_tail = ((vcl_size // 2) > 1) and (tail_size >= (vcl_size // 2)) # manually add tail, since split_iname with slabs tries to vectorize the tail - if has_tail: + if tail_size > 0: # fake suitable loop bound - vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size + vectorizable_bound = (iname_bound // vcl_size) * vcl_size from loopy.kernel.tools import DomainChanger domch = DomainChanger(knl, (vec_iname,)) knl = knl.copy(domains=domch.get_domains_with( @@ -498,7 +497,7 @@ def vectorize_micro_elements(knl): knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname) tail_iname = vec_iname + '_inner' + '_tail' - knl = realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size) + knl = realize_tail(knl, inner_iname, outer_iname, iname_bound, tail_iname, vcl_size) else: knl = lp.split_iname(knl, vec_iname, vcl_size) @@ -512,12 +511,11 @@ def vectorize_micro_elements(knl): knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size) knl = add_vcl_access(knl, inner_iname, vcl_size, level) - if has_tail and vectorize_tail: - from pudb import set_trace; set_trace() - knl = _do_vectorization(knl, tail_iname, vcl_size // 2, level + 1) + if tail_size > 0 and vectorize_tail: + knl = _do_vectorization(knl, tail_iname, tail_size, vcl_size // 2, level + 1) return knl - knl = _do_vectorization(knl, orig_iname, vcl_size) + knl = _do_vectorization(knl, orig_iname, get_form_option('number_of_blocks'), vcl_size) return knl