diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py index 95298bb181cc78529789ef7856e24d921f21e90e..ac3a58db7bb8f66c40e154ade21c937504bdffce 100644 --- a/python/dune/codegen/blockstructured/vectorization.py +++ b/python/dune/codegen/blockstructured/vectorization.py @@ -45,8 +45,8 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): nptype = dtype_floatingpoint() vcl_size = get_vcl_type_size(np.float64) - from loopy.match import Tagged, Iname, And - accum_insns = set(lp.find_instructions(knl, And((Tagged('accum'), Iname(iname_inner))))) + accum_insns = lp.find_instructions(knl, And((Tagged('accum'), Iname(iname_inner)))) + accum_ids = {insn.id for insn in accum_insns} new_insns = [] vng = knl.get_var_name_generator() @@ -54,7 +54,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): new_vec_temporaries = dict() for insn in knl.instructions: # somehow CInstructions are not hashable.... - if isinstance(insn, lp.MultiAssignmentBase) and insn in accum_insns: + if isinstance(insn, lp.MultiAssignmentBase) and insn.id in accum_ids: # write accum expr as "r = expr + r" expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee)) @@ -94,7 +94,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(iname_inner),)) # init a - id_init_a = idg('insn_init_' + identifier_left) + id_init_a = idg('{}_init_{}'.format(insn.id, identifier_left)) new_insns.append(lp.Assignment(assignee=substitute(var_left, replace_head_inames), expression=0, id=id_init_a, @@ -106,8 +106,8 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): expr_right = substitute(expr_without_r, {iname_ix: 1}) expr_left = prim.Sum((substitute(expr_without_r, {iname_ix: 0}), var_left)) - id_set_left = idg('insn_' + identifier_left) - id_set_right = idg('insn_' + identifier_right) + id_set_left = idg('{}_{}'.format(insn.id, 
identifier_left)) + id_set_right = idg('{}_{}'.format(insn.id, identifier_right)) new_insns.append(lp.Assignment(assignee=var_right, expression=expr_right, id=id_set_right, @@ -120,7 +120,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): within_inames=insn.within_inames - frozenset({iname_ix}))) # r+=a[iy] - id_accum = idg('insn_mod_accum') + id_accum = idg('{}_mod_accum'.format(insn.id)) expr_accum = prim.Sum((var_left, prim.Call(VCLPermute(nptype, vcl_size, (-1,) + tuple(range(vcl_size - 1))), (var_right,)), @@ -133,7 +133,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): within_inames=insn.within_inames - frozenset({iname_ix}), tags=frozenset({'accum'}))) # a[iy] = permute - id_permute = idg('insn_permute') + id_permute = idg('{}_permute'.format(insn.id)) expr_permute = prim.Call(VCLPermute(nptype, vcl_size, (vcl_size - 1,) + (-1,) * (vcl_size - 1)), (var_right,)) new_insns.append(lp.Assignment(assignee=var_left, @@ -145,7 +145,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): )) # tail handling, uses tail alias - id_accum_tail = idg('insn_accum_tail') + id_accum_tail = idg('{}_accum_tail'.format(insn.id)) subst_map = {iname_inner: vcl_size - 1, iname_outer: get_form_option("number_of_blocks") // vcl_size - 1, iname_ix: 1, insn.assignee_name: prim.Variable(insn.assignee_name + '_tail'), **replace_tail_inames} @@ -164,7 +164,11 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): inames_micro) | inames_tail, tags=frozenset({'tail'}))) else: - new_insns.append(insn) + if insn.id.endswith('tail') and insn.id.replace('_tail', '') in accum_ids: + accum_id = insn.id.replace('_tail', '') + new_insns.append(insn.copy(depends_on=insn.depends_on | frozenset({accum_id + '_accum_tail'}))) + else: + new_insns.append(insn) return knl.copy(instructions=new_insns, temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))