Skip to content
Snippets Groups Projects
Commit 8097cac4 authored by Marcel Koch's avatar Marcel Koch
Browse files

vectorize tail

odd behavior for unstructured tensor test...
parent e27b8015
No related branches found
No related tags found
No related merge requests found
......@@ -173,8 +173,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer, vcl_size):
temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))
def add_vcl_access(knl, iname_inner, vcl_size):
from loopy.match import Reads, Tagged
def add_vcl_access(knl, iname_inner, vcl_size, levels=0):
accum_insns = set((insn.id for insn in lp.find_instructions(knl, And((Tagged('accum_vec{}'.format(vcl_size)),
Iname(iname_inner))))))
read_insns = set((insn.id for insn in lp.find_instructions(knl, And((Reads('*alias'), Iname(iname_inner))))))
......@@ -221,15 +220,15 @@ def add_vcl_access(knl, iname_inner, vcl_size):
# compute index without vec iname
strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
if i != 0 and i.name != iname_inner))
flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
if not (isinstance(i, prim.Variable) and i.name == iname_inner)))
# find write insns
write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
# add load instruction
load_id = idg('insn_' + name_vec + '_load')
call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),))
load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),
depends_on=insn.depends_on | write_ids,))
......@@ -248,15 +247,15 @@ def add_vcl_access(knl, iname_inner, vcl_size):
# flat index without vec iname
strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
if i != 0 and i.name != iname_inner))
flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
if not (isinstance(i, prim.Variable) and i.name == iname_inner)))
# find write insns
write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
# add store instruction
store_id = idg('insn_' + name_vec + '_store')
call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),))
store_insns.append(lp.CallInstruction(assignees=(), expression=call_store,
id=store_id, within_inames=insn.within_inames,
depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] |
......@@ -284,14 +283,17 @@ def add_vcl_access(knl, iname_inner, vcl_size):
knl_with_subst_insns = knl_with_subst_insns.copy(instructions=new_insns)
# substitution rule for alias[ex_outer,ex_inner, ey, ix, iy] -> vec[ex_inner]
parameters = 'ex_o,ex_i,' + ','.join(['e' + d for d in dim_names[1:dim]]) + \
',ix,' + ','.join(['i' + d for d in dim_names[1:dim]])
parameters = ','.join(['ex_o{}'.format(l) for l in range(levels + 1)]) + \
',v_i,' + \
','.join(['e' + d for d in dim_names[1:dim]]) + \
',ix,' + \
','.join(['i' + d for d in dim_names[1:dim]])
knl_with_subst_insns = lp.extract_subst(knl_with_subst_insns, alias + '_subst', '{}[{}]'.format(alias, parameters),
parameters=parameters)
new_subst = knl_with_subst_insns.substitutions.copy()
rule = new_subst[alias + '_subst']
rule.expression = prim.Subscript(prim.Variable(alias.replace(alias_suffix, vector_sufix)),
(prim.Variable('ex_i'),))
(prim.Variable('v_i'),))
knl_with_subst_insns = knl_with_subst_insns.copy(substitutions=new_subst)
knl_with_subst_insns = lp.expand_subst(knl_with_subst_insns, Iname(iname_inner))
......@@ -462,10 +464,14 @@ def realize_tail(knl, iname_inner, iname_outer, vcl_size):
def vectorize_micro_elements(knl):
vec_iname = "subel_x"
orig_iname = vec_iname
if vec_iname in knl.all_inames() and get_global_context_value('integral_type') == 'cell':
vcl_size = get_vcl_type_size(np.float64)
has_tail = get_form_option('number_of_blocks') % vcl_size > 0
vectorize_tail = True
knl = add_iname_array(knl, vec_iname)
# manually add tail, since split_iname with slabs tries to vectorize the tail
if has_tail:
......@@ -477,19 +483,34 @@ def vectorize_micro_elements(knl):
knl = lp.split_iname(knl, vec_iname, vcl_size)
knl = realize_tail(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
tail_iname = vec_iname + '_inner' + '_tail'
else:
knl = lp.split_iname(knl, vec_iname, vcl_size)
knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')])
knl = add_iname_array(knl, vec_iname)
array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
iname_vector = [a for a in knl.temporary_variables.keys() if a.endswith('vec')]
knl = lp.split_array_axis(knl, array_alias + iname_vector, 0, vcl_size)
knl = lp.tag_array_axes(knl, iname_vector, ('c', 'vec'))
knl = lp.split_array_axis(knl, array_alias, 0, vcl_size)
def _do_vectorization(knl, iname_inner, iname_outer, vcl_size, levels=0):
knl = add_vcl_iname_array(knl, orig_iname, iname_inner, vcl_size)
knl = add_vcl_temporaries(knl, vcl_size)
knl = add_vcl_accum_insns(knl, iname_inner, iname_outer, vcl_size)
knl = add_vcl_access(knl, iname_inner, vcl_size, levels)
return knl
knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
if has_tail and vectorize_tail:
vcl_size = vcl_size // 2
vec_iname = tail_iname
knl = lp.split_iname(knl, vec_iname, vcl_size)
knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')])
array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
knl = lp.split_array_axis(knl, array_alias, 1, vcl_size)
knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size, levels=1)
knl = add_vcl_temporaries(knl, vcl_size)
knl = add_vcl_accum_insns(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
knl = add_vcl_access(knl, vec_iname + '_inner', vcl_size)
return knl
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment