diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py index b4b57f97b5a379585a00eecb421c8d84a0828a7f..ff5a4ae36f7bf279ac56d582d6408163e30c35a1 100644 --- a/python/dune/perftool/blockstructured/accumulation.py +++ b/python/dune/perftool/blockstructured/accumulation.py @@ -11,14 +11,20 @@ import pymbolic.primitives as prim def name_accumulation_alias(container, accumspace): name = container+"_"+accumspace.lfs.name+"_alias" + name_tail = container+"_"+accumspace.lfs.name+"_alias_tail" k = get_option("number_of_blocks") p = accumspace.element.degree() - globalarg(name, shape=(k, k, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1), managed=True) - code = "auto {} = &{}.container()({},0);".format(container + "_"+accumspace.lfs.name+"_alias", container, accumspace.lfs.name) - instruction(within_inames=frozenset(), - code=code, - read_variables=frozenset({container}), - assignees=frozenset({name})) + + def _add_alias_insn(name): + globalarg(name, shape=(k, k, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1), managed=True) + code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name) + instruction(within_inames=frozenset(), + code=code, + read_variables=frozenset({container}), + assignees=frozenset({name})) + + _add_alias_insn(name) + _add_alias_insn(name_tail) return name @@ -53,5 +59,6 @@ def generate_accumulation_instruction(expr, visitor): expression=prim.Sum((expr_with_weight, assignee)), forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)), forced_iname_deps_is_final=True, - predicates=predicates + predicates=predicates, + tags=frozenset({'accum'}) ) diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py index af64fab9caa8e6e1038f4f9ecc852ecb4e3b2ca2..00c3b68486c19f37cd80b50927e741e71751f4fd 100644 --- a/python/dune/perftool/blockstructured/vectorization.py +++ b/python/dune/perftool/blockstructured/vectorization.py @@ -4,60 +4,38 @@ import pymbolic.primitives as prim from dune.perftool.loopy.temporary import DuneTemporaryVariable from dune.perftool.loopy.symbolic import substitute from dune.perftool.options import get_option +from dune.perftool.pdelab.argument import PDELabAccumulationFunction +from dune.perftool.tools import get_pymbolic_indices -def add_vcl_vector(knl, iname_inner, iname_outer): - read_insn_to_vec_instance = dict() - write_insn_to_vec_instance = dict() - read_vecs = set() - write_vecs = set() - - from loopy.symbolic import WalkMapper - class FindAlias(WalkMapper): - def __init__(self, id, expr, vecs, insn_to_vec): - self.id = id - self.vecs = vecs - self.insn_to_vec = insn_to_vec - self.rec(expr) - - def visit(self, expr, **kwargs): - if isinstance(expr, prim.Subscript) \ - and expr.aggregate.name.endswith('alias'): - vec_name = expr.aggregate.name.replace('alias', 'vec') - self.vecs.add(vec_name) - self.insn_to_vec.setdefault(self.id, set()) - self.insn_to_vec[self.id].add((vec_name,expr)) - return False - else: - return True - # find all instances where an alias is read from or written to - # save the name of the corresponding vector and the alias subscript expression - for insn in knl.instructions: - if isinstance(insn, lp.MultiAssignmentBase): - FindAlias(insn.id, insn.expression, read_vecs, read_insn_to_vec_instance) - if isinstance(insn, lp.Assignment): - FindAlias(insn.id, insn.assignee, write_vecs, write_insn_to_vec_instance) - - read_insns = [knl.id_to_insn[id] for id in read_insn_to_vec_instance] - write_insns = [knl.id_to_insn[id] for id in write_insn_to_vec_instance] +def add_vcl_temporaries(knl): + vector_alias = [a for a in knl.arg_dict if a.endswith('alias')] # add new temporaries for vectors + # hope one read insn doesn't have two different reads from the same temporary new_vec_temporaries = dict() - for vec in read_vecs|write_vecs: - new_vec_temporaries[vec] = DuneTemporaryVariable(vec, dtype=np.float64, shape=(4,), managed=True, - scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) + for alias in vector_alias: + vector_name = alias.replace('alias','vec') + new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, shape=(4,), managed=True, + scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) + + return knl.copy(temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries)) - modified_accum_insn = [] - replace_accum_insn = dict() + +def add_vcl_accum_insns(knl, iname_inner, iname_outer, iname_head, iname_tail): + from loopy.match import Tagged + accum_insns = set(lp.find_instructions(knl, Tagged('accum'))) + + new_insns = [] vng = knl.get_var_name_generator() idg = knl.get_instruction_id_generator() - for insn in write_insns: - if isinstance(insn, lp.Assignment): + new_vec_temporaries = dict() + for insn in knl.instructions: + # somehow CInstructions are not hashable.... + if isinstance(insn, lp.MultiAssignmentBase) and insn in accum_insns: # write accum expr as "r = expr + r" expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee)) - if expr_without_r == insn.expression: - continue - # finde micro inames + iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x"))) iname_iy = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_y"))) @@ -74,14 +52,15 @@ def add_vcl_vector(knl, iname_inner, iname_outer): # init a id_init_a = idg('insn_init_'+identifier_a) - modified_accum_insn.append(lp.Assignment(assignee=substitute(var_a,{iname_iy:prim.Variable(iname_iy+'_head')}), - expression=0, - id=id_init_a, - within_inames=(insn.within_inames-frozenset({iname_ix, iname_iy, - iname_outer})) - |frozenset({iname_iy+'_head'}), - ) - ) + new_insns.append(lp.Assignment(assignee=substitute(var_a,{iname_iy:prim.Variable(iname_head)}), + expression=0, + id=id_init_a, + within_inames=(insn.within_inames-frozenset({iname_ix, iname_iy, + iname_outer})) + | frozenset({iname_head}), + tags=frozenset({'head'}) + ) + ) # setze werte für a und b expr_b = substitute(expr_without_r, {iname_ix:1}) @@ -89,138 +68,170 @@ def add_vcl_vector(knl, iname_inner, iname_outer): id_set_a = idg('insn_'+identifier_a) id_set_b = idg('insn_'+identifier_b) - modified_accum_insn.append(lp.Assignment(assignee=var_b, - expression=expr_b, - id=id_set_b, - depends_on=insn.depends_on, - within_inames=insn.within_inames-frozenset({iname_ix}), - ) - ) - modified_accum_insn.append(lp.Assignment(assignee=var_a, - expression=expr_a, - id=id_set_a, - depends_on=insn.depends_on|frozenset({id_init_a}), - within_inames=insn.within_inames-frozenset({iname_ix}), - ) - ) + new_insns.append(lp.Assignment(assignee=var_b, + expression=expr_b, + id=id_set_b, + depends_on=insn.depends_on, + within_inames=insn.within_inames-frozenset({iname_ix}), + ) + ) + new_insns.append(lp.Assignment(assignee=var_a, + expression=expr_a, + id=id_set_a, + depends_on=insn.depends_on|frozenset({id_init_a}), + within_inames=insn.within_inames-frozenset({iname_ix}), + ) + ) # r+=a[iy] id_accum = idg('insn_mod_accum') expr_accum = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)), substitute(insn.assignee, {iname_ix:0}))) - replace_accum_insn[insn.id] = lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}), - expression=expr_accum, - id=id_accum, - depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b}), - within_inames=insn.within_inames-frozenset({iname_ix}) - ) + new_insns.append(lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}), + expression=expr_accum, + id=id_accum, + depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b}), + within_inames=insn.within_inames-frozenset({iname_ix}), + tags=frozenset({'accum'}) + ) + ) # a[iy] = permute id_permute = idg('insn_permute') expr_permute = prim.Call(prim.Variable('permute4d<3,-1,-1,-1>'), (var_b,)) - modified_accum_insn.append(lp.Assignment(assignee=var_a, - expression=expr_permute, - id=id_permute, - depends_on=replace_accum_insn[insn.id].depends_on - |frozenset({replace_accum_insn[insn.id].id}), - within_inames=insn.within_inames-frozenset({iname_ix}) - ) - ) - - # tail handling | frozenset({iname_iy+'_tail'}))) + new_insns.append(lp.Assignment(assignee=var_a, + expression=expr_permute, + id=id_permute, + depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b, + id_accum}), + within_inames=insn.within_inames-frozenset({iname_ix}) + ) + ) + + # tail handling, uses tail alias id_accum_tail = idg('insn_accum_tail') - subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")/4, iname_iy: prim.Variable(iname_iy+'_tail'), - iname_ix: 0} + subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")//4, iname_iy: prim.Variable(iname_tail), + iname_ix: 0, insn.assignee_name: prim.Variable(insn.assignee_name+'_tail')} assignee_tail = substitute(insn.assignee, subst_map) - expr_tail = prim.Sum((prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_iy+'_tail'), 0)), + expr_tail = prim.Sum((prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_tail), 0)), assignee_tail)) - modified_accum_insn.append(lp.Assignment(assignee=assignee_tail, - expression=expr_tail, - id=id_accum_tail, - depends_on=frozenset({replace_accum_insn[insn.id].id, - id_permute, id_set_a, id_init_a}), - within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer, - iname_ix, iname_iy})) - | frozenset({iname_iy+'_tail'}))) - - # update found instances with respect to modified accumulation instruction - def update_insn_to_vec_instance(insns, expr, insn_to_vec_instance): - new_insns = [] - for insn in insns: - if insn.id in replace_accum_insn: - new_insns.append(replace_accum_insn[insn.id]) - insn_to_vec_instance.pop(insn.id) - FindAlias(new_insns[-1].id, expr(new_insns[-1]), set(), insn_to_vec_instance) + + new_insns.append(lp.Assignment(assignee=assignee_tail, + expression=expr_tail, + id=id_accum_tail, + depends_on=frozenset({id_accum,id_permute, id_set_a, id_init_a}), + within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer, + iname_ix, iname_iy})) + | frozenset({iname_tail}), + tags=frozenset({'tail'}))) + else: + new_insns.append(insn) + + return knl.copy(instructions=new_insns, + temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries)) + + +def add_vcl_access(knl, iname_inner): + from loopy.match import Reads, Tagged + accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum')))) + read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias')))) + + from loopy.symbolic import CombineMapper + from loopy.symbolic import IdentityMapper + class AliasIndexCollector(CombineMapper, IdentityMapper): + def combine(self, values): + return sum(values, tuple()) + + def map_constant(self, expr): + return tuple() + + map_variable = map_constant + map_function_symbol = map_constant + + def map_subscript(self, expr): + if expr.aggregate.name.endswith('alias'): + return expr.aggregate, expr.index_tuple else: - new_insns.append(insn) - return new_insns - read_insns = update_insn_to_vec_instance(read_insns, lambda i: i.expression, read_insn_to_vec_instance) - write_insns = update_insn_to_vec_instance(write_insns, lambda i: i.assignee, write_insn_to_vec_instance) + return tuple() # add load instructions if the vector is read, based on the read instruction # TODO brauche mehrere vectoren, falls in einer insn von einem alias mit unterschiedlichen idx gelesen wird # muss dafür eigentlich nur den namen des vectors anpassen + idg = knl.get_instruction_id_generator() + aic = AliasIndexCollector() load_insns = [] read_dependencies = dict() - for insn in read_insns: - for vec, expr in read_insn_to_vec_instance[insn.id]: - alias = vec.replace('vec', 'alias') - - # flat index without vec iname - strides = tuple(tag.stride for tag in knl.arg_dict[alias].dim_tags) - index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner:0}).index_tuple, strides))) - - # add load instruction - load_id = idg('insn_'+vec+'_load') - call_load = prim.Call(prim.Variable(vec+'.load'), (prim.Sum((prim.Variable(alias), index)),)) - load_insns.append(lp.CallInstruction(assignees=(),expression=call_load, - id=load_id,within_inames=insn.within_inames|insn.reduction_inames(),)) - read_dependencies.setdefault(insn.id, set()) - read_dependencies[insn.id].add(load_id) + for id in read_insns: + insn = knl.id_to_insn[id] + + alias, index = aic(insn.expression) + name_alias = alias.name + name_vec = name_alias.replace('alias', 'vec') + + # compute index without vec iname + strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) + index = prim.Sum(tuple(prim.Product((i,s)) for i,s in zip(index, strides) if i!=0 and i.name != iname_inner)) + + # add load instruction + load_id = idg('insn_'+name_vec+'_load') + call_load = prim.Call(prim.Variable(name_vec+'.load'), (prim.Sum((prim.Variable(name_alias), index)),)) + load_insns.append(lp.CallInstruction(assignees=(),expression=call_load, + id=load_id,within_inames=insn.within_inames|insn.reduction_inames(),)) + read_dependencies.setdefault(id, set()) + read_dependencies[id].add(load_id) # add store instructions if the vector is written, based on the write instruction store_insns = [] - for insn in write_insns: - for vec, expr in write_insn_to_vec_instance[insn.id]: - alias = vec.replace('vec', 'alias') - - # flat index without vec iname - strides = tuple(tag.stride for tag in knl.arg_dict[alias].dim_tags) - index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner: 0, iname_ix: 0}).index_tuple, strides))) - - # add store instruction - store_id = idg('insn_'+vec+'_store') - call_store = prim.Call(prim.Variable(vec+'.store'), (prim.Sum((prim.Variable(alias), index)),)) - store_insns.append(lp.CallInstruction(assignees=(),expression=call_store, - id=store_id,within_inames=insn.within_inames, - depends_on=insn.depends_on - | frozenset({insn.id}) - | read_dependencies[insn.id], - )) - - # exchange alias for vector + for id in accum_insns: + insn = knl.id_to_insn[id] + + alias, index = aic(insn.expression) + name_alias = alias.name + name_vec = name_alias.replace('alias', 'vec') + + # flat index without vec iname + strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) + index = prim.Sum(tuple(prim.Product((i,s)) for i,s in zip(index, strides) if i!=0 and i.name != iname_inner)) + + # add store instruction + store_id = idg('insn_'+name_vec+'_store') + call_store = prim.Call(prim.Variable(name_vec+'.store'), (prim.Sum((prim.Variable(name_alias), index)),)) + store_insns.append(lp.CallInstruction(assignees=(),expression=call_store, + id=store_id,within_inames=insn.within_inames, + depends_on=insn.depends_on + | frozenset({id}) + | read_dependencies[id], + )) + + # replace alias with vcl vector, except for accumulation assignee + vector_alias = [a for a in knl.arg_dict if a.endswith('alias')] + for alias in vector_alias: + knl = lp.extract_subst(knl, alias+'_subst', '{}[ex_o, ex_i,ey,ix,iy]'.format(alias), + parameters='ex_o,ex_i,ey,ix,iy') + new_subst = knl.substitutions.copy() + rule = new_subst[alias+'_subst'] + rule.expression = prim.Subscript(prim.Variable(alias.replace('alias','vec')), (prim.Variable('ex_i'),)) + knl = knl.copy(substitutions=new_subst) + + from loopy.match import All + knl = lp.expand_subst(knl, All()) + + # add store and load dependencies and set right accumulation assignee new_insns = [] for insn in knl.instructions: - insn = replace_accum_insn.get(insn.id, insn) - if insn.id not in read_insn_to_vec_instance.keys() | write_insn_to_vec_instance.keys(): + if insn.id not in read_insns | accum_insns: new_insns.append(insn) else: - subst_map = dict() - for vec, expr in read_insn_to_vec_instance[insn.id]: - subst_map[expr] = prim.Subscript(prim.Variable(vec), (prim.Variable(iname_inner),)) - - new_insn = insn - - if insn in read_insns: - new_insn = new_insn.copy(expression=substitute(new_insn.expression, subst_map), - depends_on=new_insn.depends_on | read_dependencies[insn.id]) - if insn in write_insns: - new_insn = new_insn.copy(assignee=substitute(new_insn.assignee, subst_map)) - - new_insns.append(new_insn) + if insn.id in accum_insns: + assignee_alias = insn.assignee + assignee_vec = next((expr for expr in insn.expression.children + if isinstance(expr, prim.Subscript) + and expr.aggregate.name.replace('vec','alias') == assignee_alias.aggregate.name)) + new_insns.append(insn.copy(assignee=assignee_vec, + depends_on=insn.depends_on | read_dependencies[insn.id])) + else: + new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id])) - from loopy.kernel.creation import resolve_dependencies - return resolve_dependencies(knl.copy(instructions=new_insns+load_insns+store_insns+modified_accum_insn, - temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))) + return knl.copy(instructions=new_insns+load_insns+store_insns) def find_accumulation_inames(knl): @@ -240,8 +251,16 @@ def vectorize_micro_elements(knl): from loopy.match import Not, All accum_inames = find_accumulation_inames(knl) for iname in accum_inames: + iname_head = iname+'_head' + iname_tail = iname+'_tail' knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_tail') knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_head') + knl = lp.split_iname(knl,"subel_x",4, inner_tag='vec') - knl = add_vcl_vector(knl,'subel_x_inner', 'subel_x_outer') + array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] + knl = lp.split_array_axis(knl, array_alias, 0, 4) + + knl = add_vcl_temporaries(knl) + knl = add_vcl_accum_insns(knl, 'subel_x_inner', 'subel_x_outer', iname_head, iname_tail) + knl = add_vcl_access(knl, 'subel_x_inner') return knl