diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py new file mode 100644 index 0000000000000000000000000000000000000000..05b2c8b61e56b0edc96a14640ad2c7e8512c7066 --- /dev/null +++ b/python/dune/perftool/blockstructured/accumulation.py @@ -0,0 +1,47 @@ +from dune.perftool.generation import temporary_variable, instruction +from dune.perftool.options import get_option +from dune.perftool.pdelab.localoperator import determine_accumulation_space +from dune.perftool.pdelab.argument import name_accumulation_variable +from dune.perftool.pdelab.localoperator import boundary_predicates +import pymbolic.primitives as prim + + +def name_accumulation_alias(container, accumspace): + name = container+"_"+accumspace.lfs.name+"_alias" + k = get_option("number_of_blocks") + p = accumspace.element.degree() + temporary_variable(name, shape=(k + 1, k + 1, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1), + base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True) + code = "{} = &{}.container()({},0);".format(container + "_"+accumspace.lfs.name+"_alias", container, accumspace.lfs.name) + instruction(within_inames=frozenset(), + code=code, + read_variables=frozenset({container}), + assignees=frozenset({name})) + return name + + +def generate_accumulation_instruction(expr, visitor): + # Collect the lfs and lfs indices for the accumulate call + test_lfs = determine_accumulation_space(visitor.test_info, 0) + # In the jacobian case, also determine the space for the ansatz space + ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1) + + # Collect the lfs and lfs indices for the accumulate call + accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) + accumvar_alias = name_accumulation_alias(accumvar, test_lfs) + + predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id) + + quad_inames = visitor.interface.quadrature_inames() + lfs_inames = visitor.test_info.inames + if visitor.trial_info: + lfs_inames = lfs_inames + visitor.trial_info.inames + + assignee = prim.Subscript(prim.Variable(accumvar_alias), tuple(prim.Variable(i) for i in lfs_inames)) + + instruction(assignee=assignee, + expression=prim.Sum((expr, assignee)), + forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)), + forced_iname_deps_is_final=True, + predicates=predicates + ) diff --git a/python/dune/perftool/blockstructured/argument.py b/python/dune/perftool/blockstructured/argument.py index 1d7312b3748cb8d8963b3fe057e50e763e52c687..1d4fe319408230aa317ed0c23e9e3073e8870f4d 100644 --- a/python/dune/perftool/blockstructured/argument.py +++ b/python/dune/perftool/blockstructured/argument.py @@ -9,6 +9,20 @@ import pymbolic.primitives as prim import loopy as lp +def name_alias(container, lfs, element): + name = container+"_"+lfs.name+"_alias" + k = get_option("number_of_blocks") + p = element.degree() + temporary_variable(name, shape=(k + 1, k + 1, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1), + base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True) + code = "{} = &{}({},0);".format(name,container, lfs.name) + instruction(within_inames=frozenset(), + code=code, + read_variables=frozenset({container}), + assignees=frozenset({name})) + return name + + # TODO remove the need for element @kernel_cached @backend(interface="pymbolic_coefficient", name="blockstructured") @@ -22,4 +36,9 @@ def pymbolic_coefficient(container, lfs, element, index): lfs = prim.Variable(lfs) # use higher order FEM index instead of Q1 index - return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) + if get_option("vectorization_blockstructured"): + subelem_inames = sub_element_inames() + coeff_alias = name_alias(container, lfs, element) + return prim.Subscript(lp.TaggedVariable(coeff_alias, 'coeff_alias'), tuple(prim.Variable(i) for i in subelem_inames+index)) + else: + return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py index cef42b962b8a1c7876ccc7ae2914796f0acdbc28..0e15869eb3572b2a65aa746c38eb2a92fbc4d27c 100644 --- a/python/dune/perftool/blockstructured/vectorization.py +++ b/python/dune/perftool/blockstructured/vectorization.py @@ -17,32 +17,28 @@ def add_vcl_vector(knl, iname_inner, iname_outer): from loopy.symbolic import WalkMapper class FindAlias(WalkMapper): - def __init__(self, id, expr): + def __init__(self, id, expr, vecs, insn_to_vec): self.id = id + self.vecs = vecs + self.insn_to_vec = insn_to_vec self.rec(expr) def post_visit(self,expr): - if isinstance(expr, prim.Call): - func = expr.function - param = expr.parameters - if (isinstance(func, CoefficientAccess) or isinstance(func, PDELabAccumulationFunction)) \ - and iname_inner in get_pymbolic_indices(prim.Subscript((), (param[1],))): - lfs = param[0].name - vec_name = (func.container if isinstance(func, CoefficientAccess) else func.accumobj) + "_" + lfs + "_vec" - read_vecs.add(vec_name) - read_insn_to_vec_instance.setdefault(self.id, set()) - read_insn_to_vec_instance[self.id].add((vec_name, expr)) - - if isinstance(func, PDELabAccumulationFunction): - write_vecs.add(vec_name) - write_insn_to_vec_instance[self.id] = (vec_name, expr) - + if isinstance(expr, prim.Subscript) \ + and isinstance(expr.aggregate, prim.Variable) \ + and expr.aggregate.name.endswith('alias'): + vec_name = expr.aggregate.name.replace('alias', 'vec') + self.vecs.add(vec_name) + self.insn_to_vec.setdefault(self.id, set()) + self.insn_to_vec[self.id].add((vec_name,expr)) return # find all instances where an alias is read from or written to # save the name of the corresponding vector and the alias subscript expression for insn in knl.instructions: if isinstance(insn, lp.MultiAssignmentBase): - FindAlias(insn.id, insn.expression) + FindAlias(insn.id, insn.expression, read_vecs, read_insn_to_vec_instance) + if isinstance(insn, lp.Assignment): + FindAlias(insn.id, insn.assignee, write_vecs, write_insn_to_vec_instance) read_insns = [knl.id_to_insn[id] for id in read_insn_to_vec_instance] write_insns = [knl.id_to_insn[id] for id in write_insn_to_vec_instance] @@ -53,16 +49,16 @@ def add_vcl_vector(knl, iname_inner, iname_outer): new_vec_temporaries[vec] = DuneTemporaryVariable(vec, dtype=np.float64, shape=(4,), managed=True, scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) - # write accum expr as "expr + r" modified_accum_insn = [] replace_accum_insn = dict() vng = knl.get_var_name_generator() idg = knl.get_instruction_id_generator() for insn in write_insns: - if isinstance(insn, lp.CallInstruction) and isinstance(insn.expression.function, PDELabAccumulationFunction): - vec_name, expr = write_insn_to_vec_instance[insn.id] - expr_accum = insn.expression.parameters[-1] - + if isinstance(insn, lp.Assignment): + # write accum expr as "r = expr + r" + expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee)) + if expr_without_r == insn.expression: + continue # finde micro inames iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x"))) iname_iy = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_y"))) @@ -80,7 +76,7 @@ def add_vcl_vector(knl, iname_inner, iname_outer): # init a id_init_a = idg('insn_init_'+identifier_a) - modified_accum_insn.append(lp.Assignment(assignee=substitute(var_a, {iname_iy: prim.Variable(iname_iy+'_head')}), + modified_accum_insn.append(lp.Assignment(assignee=substitute(var_a,{iname_iy:prim.Variable(iname_iy+'_head')}), expression=0, id=id_init_a, within_inames=(insn.within_inames-frozenset({iname_ix, iname_iy, @@ -90,8 +86,8 @@ def add_vcl_vector(knl, iname_inner, iname_outer): ) # setze werte für a und b - expr_b = substitute(expr_accum, {iname_ix: 1}) - expr_a = prim.Sum((substitute(expr_accum, {iname_ix: 0}), var_a)) + expr_b = substitute(expr_without_r, {iname_ix:1}) + expr_a = prim.Sum((substitute(expr_without_r, {iname_ix:0}), var_a)) id_set_a = idg('insn_'+identifier_a) id_set_b = idg('insn_'+identifier_b) @@ -112,13 +108,12 @@ def add_vcl_vector(knl, iname_inner, iname_outer): # r+=a[iy] id_accum = idg('insn_mod_accum') - r_vec = prim.Subscript(prim.Variable(vec_name),(prim.Variable(iname_inner),)) - expr_accum_mod = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)), r_vec)) - replace_accum_insn[insn.id] = lp.Assignment(assignee=r_vec, - expression=expr_accum_mod, + expr_accum = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)), + substitute(insn.assignee, {iname_ix:0}))) + replace_accum_insn[insn.id] = lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}), + expression=expr_accum, id=id_accum, - depends_on=insn.depends_on|frozenset({id_set_a, id_set_b, - id_init_a}), + depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b}), within_inames=insn.within_inames-frozenset({iname_ix}) ) # a[iy] = permute @@ -133,21 +128,35 @@ def add_vcl_vector(knl, iname_inner, iname_outer): ) ) - # tail handling + # tail handling | frozenset({iname_iy+'_tail'}))) id_accum_tail = idg('insn_accum_tail') subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")/4, iname_iy: prim.Variable(iname_iy+'_tail'), iname_ix: 0} - expr_tail = prim.Call(expr.function, tuple(substitute(p, subst_map) for p in expr.parameters[:-1]) - + (prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_iy+'_tail'), 0)),)) - - modified_accum_insn.append(lp.CallInstruction(assignees=(), - expression=expr_tail, - id=id_accum_tail, - depends_on=frozenset({replace_accum_insn[insn.id].id, - id_permute, id_set_a, id_init_a}), - within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer, - iname_ix, iname_iy})) - | frozenset({iname_iy+'_tail'}))) + assignee_tail = substitute(insn.assignee, subst_map) + expr_tail = prim.Sum((prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_iy+'_tail'), 0)), + assignee_tail)) + modified_accum_insn.append(lp.Assignment(assignee=assignee_tail, + expression=expr_tail, + id=id_accum_tail, + depends_on=frozenset({replace_accum_insn[insn.id].id, + id_permute, id_set_a, id_init_a}), + within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer, + iname_ix, iname_iy})) + | frozenset({iname_iy+'_tail'}))) + + # update found instances with respect to modified accumulation instruction + def update_insn_to_vec_instance(insns, expr, insn_to_vec_instance): + new_insns = [] + for insn in insns: + if insn.id in replace_accum_insn: + new_insns.append(replace_accum_insn[insn.id]) + insn_to_vec_instance.pop(insn.id) + FindAlias(new_insns[-1].id, expr(new_insns[-1]), set(), insn_to_vec_instance) + else: + new_insns.append(insn) + return new_insns + read_insns = update_insn_to_vec_instance(read_insns, lambda i: i.expression, read_insn_to_vec_instance) + write_insns = update_insn_to_vec_instance(write_insns, lambda i: i.assignee, write_insn_to_vec_instance) # add load instructions if the vector is read, based on the read instruction # TODO brauche mehrere vectoren, falls in einer insn von einem alias mit unterschiedlichen idx gelesen wird @@ -156,79 +165,59 @@ def add_vcl_vector(knl, iname_inner, iname_outer): read_dependencies = dict() for insn in read_insns: for vec, expr in read_insn_to_vec_instance[insn.id]: - # different code for accumulation variable - if isinstance(expr.function, PDELabAccumulationFunction): - iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x"))) - index = substitute(expr.parameters[1], {iname_inner:0, iname_ix: 0}) - code = "{}.load(&{}.container()({}, {}));".format(vec, expr.function.accumobj, expr.parameters[0], index) - within_inames = insn.within_inames-frozenset({iname_ix}) - else: - # flat index without vec iname - # add load instruction - index = substitute(expr.parameters[1], {iname_inner:0}) - code = "{}.load(&{}({}, {}));".format(vec, expr.function.name, expr.parameters[0], index) - within_inames = insn.within_inames|insn.reduction_inames() + alias = vec.replace('vec', 'alias') + + # flat index without vec iname + strides = tuple(tag.stride for tag in knl.temporary_variables[alias].dim_tags) + index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner:0}).index_tuple, strides))) - # load_id = knl.make_unique_instruction_id(insns=knl.instructions+load_insns, based_on='insn_'+vec+'_load') + # add load instruction load_id = idg('insn_'+vec+'_load') - load_insns.append(lp.CInstruction(iname_exprs=[], code=code, - within_inames=within_inames, - #assignees=(lp.Variable(vec), ), # sonst denkt looy das müsste ein array von Vec4d sein... - id=load_id)) + call_load = prim.Call(prim.Variable(vec+'.load'), (prim.Sum((prim.Variable(alias), index)),)) + load_insns.append(lp.CallInstruction(assignees=(),expression=call_load, + id=load_id,within_inames=insn.within_inames|insn.reduction_inames(),)) read_dependencies.setdefault(insn.id, set()) read_dependencies[insn.id].add(load_id) # add store instructions if the vector is written, based on the write instruction store_insns = [] for insn in write_insns: - vec, expr = write_insn_to_vec_instance[insn.id] - - # flat index without vec iname - iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x"))) - index = substitute(expr.parameters[1], {iname_inner:0, iname_ix: 0}) - - # add store instruction - code = "{}.store(&{}.container()({}, {}));".format(vec, expr.function.accumobj, expr.parameters[0], index) - # store_id = knl.make_unique_instruction_id(insns=knl.instructions+load_insns, based_on='insn_'+vec+'_store') - store_id = idg('insn_'+vec+'_store') - store_insns.append(lp.CInstruction(iname_exprs=[], code=code, - within_inames=insn.within_inames-frozenset({iname_ix}), - depends_on=insn.depends_on - | frozenset({replace_accum_insn[insn.id].id}) - | read_dependencies[insn.id], - id=store_id)) - + for vec, expr in write_insn_to_vec_instance[insn.id]: + alias = vec.replace('vec', 'alias') + + # flat index without vec iname + strides = tuple(tag.stride for tag in knl.temporary_variables[alias].dim_tags) + index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner: 0, iname_ix: 0}).index_tuple, strides))) + + # add store instruction + store_id = idg('insn_'+vec+'_store') + call_store = prim.Call(prim.Variable(vec+'.store'), (prim.Sum((prim.Variable(alias), index)),)) + store_insns.append(lp.CallInstruction(assignees=(),expression=call_store, + id=store_id,within_inames=insn.within_inames, + depends_on=insn.depends_on + | frozenset({insn.id}) + | read_dependencies[insn.id], + )) + + # exchange alias for vector new_insns = [] for insn in knl.instructions: + insn = replace_accum_insn.get(insn.id, insn) if insn.id not in read_insn_to_vec_instance.keys() | write_insn_to_vec_instance.keys(): new_insns.append(insn) else: - if insn.id in replace_accum_insn: - new_insn = replace_accum_insn[insn.id].copy(depends_on=replace_accum_insn[insn.id].depends_on - | read_dependencies[insn.id]) - else: - subst_map = dict() - for vec, expr in read_insn_to_vec_instance[insn.id]: - subst_map[expr] = prim.Subscript(prim.Variable(vec), (prim.Variable(iname_inner),)) - - class NodeSubstitutor(SubstitutionMapper): - def __init__(self): - from pymbolic.mapper.substitutor import make_subst_func - self.subst_func = make_subst_func(subst_map) - - def map_call(self, expr, *args, **kwargs): - result = self.subst_func(expr) - if result is not None: - return result - else: - return SubstitutionMapper.map_call(self, expr, args, kwargs) - - new_insn = insn - if insn in read_insns: - new_insn = new_insn.copy(expression=NodeSubstitutor()(new_insn.expression), - depends_on=new_insn.depends_on | read_dependencies[insn.id]) - if insn in write_insns: - new_insn = new_insn.copy(assignee=NodeSubstitutor()(new_insn.assignee)) + subst_map = dict() + for vec, expr in read_insn_to_vec_instance[insn.id]: + subst_map[expr] = prim.Subscript(prim.Variable(vec), (prim.Variable(iname_inner),)) + + new_insn = insn + + if insn in read_insns: + new_insn = new_insn.copy(expression=substitute(new_insn.expression, subst_map), + depends_on=new_insn.depends_on | read_dependencies[insn.id]) + if insn in write_insns: + new_insn = new_insn.copy(assignee=substitute(new_insn.assignee, subst_map)) + new_insns.append(new_insn) from loopy.kernel.creation import resolve_dependencies @@ -239,7 +228,7 @@ def add_vcl_vector(knl, iname_inner, iname_outer): def find_accumulation_inames(knl): inames = set() for insn in knl.instructions: - if isinstance(insn, lp.CallInstruction) and isinstance(insn.expression.function, PDELabAccumulationFunction): + if any((n.startswith('r_') and n.endswith('alias') for n in insn.write_dependency_names())): inames |= insn.within_inames inames = set((i for i in inames if i.startswith('micro') and not i.endswith('_x'))) diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index c681ec6fdb5538c0f480b1d30058c4900532254f..ec2937196b8c0cddac70d98b3ec4f99f3aab67c9 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -63,6 +63,14 @@ def type_floatingpoint(): class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper): + def map_variable(self, expr, type_context): + tv = self.kernel.temporary_variables.get(expr.name) + if isinstance(tv, DuneTemporaryVariable) and tv.base_storage: + return prim.Variable(expr.name) + else: + return ExpressionToCExpressionMapper.map_variable(self, expr, type_context) + + def map_subscript(self, expr, type_context): arr = self.find_array(expr) if isinstance(arr, (DuneTemporaryVariable, DuneGlobalArg)) and not arr.managed: