Skip to content
Snippets Groups Projects
Commit 406d2502 authored by Marcel Koch's avatar Marcel Koch
Browse files

Correctly handle the tail when tail_size % new_vcl_size != 0

parent 13ab3cc0
No related branches found
No related tags found
No related merge requests found
......@@ -423,14 +423,14 @@ def add_vcl_iname_array(knl, iname, vec_iname, vcl_size):
return knl
def realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size):
def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_size):
tail_size = get_form_option('number_of_blocks') % vcl_size
new_dom = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(tail_iname, tail_size))
insns_to_duplicate = lp.find_instructions(knl, Iname(inner_iname))
ids_to_duplicate = tuple((insn.id for insn in insns_to_duplicate))
subst_map = dict([(outer_iname, get_form_option('number_of_blocks') // vcl_size),
subst_map = dict([(outer_iname, outer_bound // vcl_size),
(inner_iname, prim.Variable(tail_iname))])
temporaries_to_duplicate = dict()
......@@ -478,18 +478,17 @@ def vectorize_micro_elements(knl):
vcl_size = get_vcl_type_size(np.float64)
knl = add_iname_array(knl, vec_iname)
def _do_vectorization(knl, vec_iname, vcl_size, level=0):
def _do_vectorization(knl, vec_iname, iname_bound, vcl_size, level=0):
inner_iname = vec_iname + '_inner'
outer_iname = vec_iname + '_outer'
has_tail = get_form_option('number_of_blocks') % vcl_size > 0
vectorize_tail = (vcl_size // 2) > 1 and \
(get_form_option('number_of_blocks') % vcl_size) % (vcl_size // 2) == 0
tail_size = iname_bound % vcl_size
vectorize_tail = ((vcl_size // 2) > 1) and (tail_size >= (vcl_size // 2))
# manually add tail, since split_iname with slabs tries to vectorize the tail
if has_tail:
if tail_size > 0:
# fake suitable loop bound
vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size
vectorizable_bound = (iname_bound // vcl_size) * vcl_size
from loopy.kernel.tools import DomainChanger
domch = DomainChanger(knl, (vec_iname,))
knl = knl.copy(domains=domch.get_domains_with(
......@@ -498,7 +497,7 @@ def vectorize_micro_elements(knl):
knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname)
tail_iname = vec_iname + '_inner' + '_tail'
knl = realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size)
knl = realize_tail(knl, inner_iname, outer_iname, iname_bound, tail_iname, vcl_size)
else:
knl = lp.split_iname(knl, vec_iname, vcl_size)
......@@ -512,12 +511,11 @@ def vectorize_micro_elements(knl):
knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size)
knl = add_vcl_access(knl, inner_iname, vcl_size, level)
if has_tail and vectorize_tail:
from pudb import set_trace; set_trace()
knl = _do_vectorization(knl, tail_iname, vcl_size // 2, level + 1)
if tail_size > 0 and vectorize_tail:
knl = _do_vectorization(knl, tail_iname, tail_size, vcl_size // 2, level + 1)
return knl
knl = _do_vectorization(knl, orig_iname, vcl_size)
knl = _do_vectorization(knl, orig_iname, get_form_option('number_of_blocks'), vcl_size)
return knl
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment