diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py index b35918b23b3d6e8eebf12d6ef08c94a927d692f9..d5224e6ffdd19ae88ccbdc6a798a8b416a6fd36d 100644 --- a/python/dune/codegen/blockstructured/vectorization.py +++ b/python/dune/codegen/blockstructured/vectorization.py @@ -1,8 +1,10 @@ import loopy as lp import numpy as np import pymbolic.primitives as prim +from dune.codegen.blockstructured.tools import sub_element_inames -from loopy.match import Tagged, Id, Writes, Or +from loopy.match import Tagged, Id, Writes, Reads, And, Or, Iname, All, Not +from islpy import BasicSet from dune.codegen.generation import get_global_context_value from dune.codegen.loopy.target import dtype_floatingpoint @@ -14,25 +16,25 @@ from dune.codegen.pdelab.geometry import world_dimension from dune.codegen.tools import get_pymbolic_basename -def add_vcl_temporaries(knl): +def add_vcl_temporaries(knl, vcl_size): vector_alias = [a for a in knl.arg_dict if a.endswith('alias')] # add new temporaries for vectors # hope one read insn doesn't have two different reads from the same temporary new_vec_temporaries = dict() new_insns = [] - init_iname = 'init_vec' + init_iname = 'init_vec{}'.format(vcl_size) from islpy import BasicSet init_domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(init_iname, get_vcl_type_size(dtype_floatingpoint()))) for alias in vector_alias: - vector_name = alias.replace('alias', 'vec') + vector_name = alias.replace('alias', 'vec{}'.format(vcl_size)) new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, - shape=(get_vcl_type_size(np.float64),), managed=True, + shape=(vcl_size,), managed=True, scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) # write once to the vector such that loopy won't complain new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(init_iname)), expression=0, within_inames=frozenset({init_iname}), - id='init_' + vector_name)) + id='init_{}'.format(vector_name))) from loopy.kernel.data import VectorizeTag return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [init_domain], @@ -40,12 +42,11 @@ def add_vcl_temporaries(knl): iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()})) -def add_vcl_accum_insns(knl, iname_inner, iname_outer): +def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level): nptype = dtype_floatingpoint() - vcl_size = get_vcl_type_size(np.float64) - from loopy.match import Tagged - accum_insns = set(lp.find_instructions(knl, Tagged('accum'))) + accum_insns = lp.find_instructions(knl, And((Tagged('accum'), Iname(inner_iname)))) + accum_ids = [insn.id for insn in accum_insns] new_insns = [] vng = knl.get_var_name_generator() @@ -53,12 +54,12 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): new_vec_temporaries = dict() for insn in knl.instructions: # somehow CInstructions are not hashable.... - if isinstance(insn, lp.MultiAssignmentBase) and insn in accum_insns: + if isinstance(insn, lp.MultiAssignmentBase) and insn.id in accum_ids: # write accum expr as "r = expr + r" expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee)) inames_micro = set((i for i in insn.within_inames if i.startswith('micro'))) - iname_ix = next((i for i in inames_micro if i.endswith("_x"))) + iname_ix = next((i for i in inames_micro if '_x' in i)) # TODO use TaggedIname when available # need inames for head and tail handling a priori from loopy.match import Not, All @@ -75,8 +76,8 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): inames_tail = frozenset((var.name for var in replace_tail_inames.values())) # erstelle a[iy] und b - identifier_left = vng('left_node') - identifier_right = vng('right_node') + identifier_left = vng('left_node_vec{}'.format(vcl_size)) + identifier_right = vng('right_node_vec{}'.format(vcl_size)) new_vec_temporaries[identifier_left] = DuneTemporaryVariable(identifier_left, dtype=np.float64, shape=(2,) * (world_dimension() - 1) + (vcl_size,), @@ -89,37 +90,40 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): var_left = prim.Subscript(prim.Variable(identifier_left), tuple(prim.Variable(i) for i in sorted(inames_micro - frozenset({iname_ix}))) + - (prim.Variable(iname_inner),)) - var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(iname_inner),)) + (prim.Variable(inner_iname),)) + var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(inner_iname),)) # init a id_init_a = idg('insn_init_' + identifier_left) new_insns.append(lp.Assignment(assignee=substitute(var_left, replace_head_inames), expression=0, id=id_init_a, - within_inames=(insn.within_inames - frozenset({iname_outer}) - + within_inames=(insn.within_inames - frozenset({outer_iname}) - inames_micro) | inames_head, - tags=frozenset({'head'}))) + tags=frozenset({'head_vec{}'.format(vcl_size), + 'vectorized_{}'.format(level)}))) # setze werte für a und b expr_right = substitute(expr_without_r, {iname_ix: 1}) expr_left = prim.Sum((substitute(expr_without_r, {iname_ix: 0}), var_left)) - id_set_left = idg('insn_' + identifier_left) - id_set_right = idg('insn_' + identifier_right) + id_set_left = idg('{}_{}'.format(insn.id, identifier_left)) + id_set_right = idg('{}_{}'.format(insn.id, identifier_right)) new_insns.append(lp.Assignment(assignee=var_right, expression=expr_right, id=id_set_right, depends_on=insn.depends_on, - within_inames=insn.within_inames - frozenset({iname_ix}))) + within_inames=insn.within_inames - frozenset({iname_ix}), + tags=frozenset({'vectorized_{}'.format(level)}))) new_insns.append(lp.Assignment(assignee=var_left, expression=expr_left, id=id_set_left, depends_on=insn.depends_on | frozenset({id_init_a}), - within_inames=insn.within_inames - frozenset({iname_ix}))) + within_inames=insn.within_inames - frozenset({iname_ix}), + tags=frozenset({'vectorized_{}'.format(level)}))) # r+=a[iy] - id_accum = idg('insn_mod_accum') + id_accum = idg('{}_mod_accum'.format(insn.id)) expr_accum = prim.Sum((var_left, prim.Call(VCLPermute(nptype, vcl_size, (-1,) + tuple(range(vcl_size - 1))), (var_right,)), @@ -130,9 +134,10 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): depends_on=insn.depends_on | frozenset({id_set_left, id_init_a, id_set_right}), within_inames=insn.within_inames - frozenset({iname_ix}), - tags=frozenset({'accum'}))) + tags=frozenset({'accum_vec{}'.format(vcl_size), + 'vectorized_{}'.format(level)}))) # a[iy] = permute - id_permute = idg('insn_permute') + id_permute = idg('{}_permute'.format(insn.id)) expr_permute = prim.Call(VCLPermute(nptype, vcl_size, (vcl_size - 1,) + (-1,) * (vcl_size - 1)), (var_right,)) new_insns.append(lp.Assignment(assignee=var_left, @@ -140,16 +145,17 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): id=id_permute, depends_on=insn.depends_on | frozenset({id_set_left, id_init_a, id_set_right, id_accum}), - within_inames=insn.within_inames - frozenset({iname_ix}) + within_inames=insn.within_inames - frozenset({iname_ix}), + tags=frozenset({'vectorized_{}'.format(level)}) )) # tail handling, uses tail alias - id_accum_tail = idg('insn_accum_tail') - subst_map = {iname_inner: vcl_size - 1, iname_outer: get_form_option("number_of_blocks") // vcl_size - 1, + id_accum_tail = idg('{}_accum_tail'.format(insn.id)) + subst_map = {inner_iname: vcl_size - 1, outer_iname: get_form_option("number_of_blocks") // vcl_size - 1, iname_ix: 1, insn.assignee_name: prim.Variable(insn.assignee_name + '_tail'), **replace_tail_inames} assignee_tail = substitute(insn.assignee, subst_map) - expr_tail = prim.Sum((substitute(var_left, {iname_inner: 0, **replace_tail_inames}), assignee_tail)) + expr_tail = prim.Sum((substitute(var_left, {inner_iname: 0, **replace_tail_inames}), assignee_tail)) write_to_tail_ids = tuple(i.id for i in lp.find_instructions(knl, Writes(get_pymbolic_basename(assignee_tail)))) @@ -159,39 +165,56 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): id=id_accum_tail, depends_on=(frozenset({id_accum, id_permute, id_set_left, id_init_a}) | frozenset(write_to_tail_ids)), - within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer}) - + within_inames=(insn.within_inames - frozenset({inner_iname, outer_iname}) - inames_micro) | inames_tail, - tags=frozenset({'tail'}))) + tags=frozenset({'tail_vec{}'.format(vcl_size), + 'vectorized_{}'.format(level)}))) else: - new_insns.append(insn) + if insn.id.endswith('tail') and insn.id.replace('_tail', '') in accum_ids: + accum_id = insn.id.replace('_tail', '') + new_insns.append(insn.copy(depends_on=insn.depends_on | frozenset({accum_id + '_accum_tail'}))) + else: + new_insns.append(insn) return knl.copy(instructions=new_insns, temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries)) -def add_vcl_access(knl, iname_inner): - from loopy.match import Reads, Tagged - accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum')))) - read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias')))) +def add_vcl_access(knl, inner_iname, vcl_size, level=0): + accum_insns = set((insn.id for insn in lp.find_instructions(knl, And((Tagged('accum_vec{}'.format(vcl_size)), + Iname(inner_iname)))))) + read_insns = set((insn.id for insn in lp.find_instructions(knl, And((Reads('*alias'), Iname(inner_iname)))))) vectorized_insns = accum_insns | read_insns + alias_suffix = 'alias' + vector_sufix = 'vec{}'.format(vcl_size) + from loopy.symbolic import CombineMapper from loopy.symbolic import IdentityMapper class AliasIndexCollector(CombineMapper, IdentityMapper): + def __init__(self): + self.found_alias = False + def combine(self, values): return sum(values, tuple()) def map_constant(self, expr): - return tuple() + if self.found_alias: + return (expr,) + else: + return tuple() map_variable = map_constant map_function_symbol = map_constant map_loopy_function_identifier = map_constant def map_subscript(self, expr): - if expr.aggregate.name.endswith('alias'): - return expr.aggregate, expr.index_tuple + if expr.aggregate.name.endswith(alias_suffix): + self.found_alias = True + indices = self.combine((self.rec(index) for index in expr.index_tuple)) + self.found_alias = False + return expr.aggregate, indices else: return tuple() @@ -208,23 +231,24 @@ def add_vcl_access(knl, iname_inner): alias, index = aic(insn.expression) name_alias = alias.name - name_vec = name_alias.replace('alias', 'vec') + name_vec = name_alias.replace(alias_suffix, vector_sufix) vectorized_insn_to_vector_names[id] = (name_alias, name_vec) # compute index without vec iname strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) - index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) - if i != 0 and i.name != iname_inner)) + flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) + if not (isinstance(i, prim.Variable) and i.name == inner_iname))) # find write insns write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec))))) # add load instruction load_id = idg('insn_' + name_vec + '_load') - call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),)) + call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),)) load_insns.append(lp.CallInstruction(assignees=(), expression=call_load, id=load_id, within_inames=insn.within_inames | insn.reduction_inames(), - depends_on=insn.depends_on | write_ids,)) + depends_on=insn.depends_on | write_ids, + tags=frozenset({'vectorized_{}'.format(level)}))) read_dependencies.setdefault(id, set()) read_dependencies[id].add(load_id) @@ -235,61 +259,65 @@ def add_vcl_access(knl, iname_inner): alias, index = aic(insn.expression) name_alias = alias.name - name_vec = name_alias.replace('alias', 'vec') + name_vec = name_alias.replace(alias_suffix, vector_sufix) vectorized_insn_to_vector_names[id] = (name_alias, name_vec) # flat index without vec iname strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) - index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) - if i != 0 and i.name != iname_inner)) + flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) + if not (isinstance(i, prim.Variable) and i.name == inner_iname))) # find write insns write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec))))) # add store instruction store_id = idg('insn_' + name_vec + '_store') - call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),)) + call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),)) store_insns.append(lp.CallInstruction(assignees=(), expression=call_store, id=store_id, within_inames=insn.within_inames, depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] | - write_ids))) + write_ids), + tags=frozenset({'vectorized_{}'.format(level)}))) # replace alias with vcl vector, except for accumulation assignee - vector_alias = [a for a in knl.arg_dict if a.endswith('alias')] + vector_alias = [a for a in knl.arg_dict if a.endswith(alias_suffix)] dim = world_dimension() dim_names = ["x", "y", "z"] + [str(i) for i in range(4, dim + 1)] # remove CInstructions since loopy extract expects to get only assignments - knl_without_cinsn = knl.copy(instructions=[insn for insn in knl.instructions - if not isinstance(insn, lp.CInstruction)]) + knl_with_subst_insns = knl.copy(instructions=[insn for insn in lp.find_instructions(knl, Iname(inner_iname)) + if not isinstance(insn, lp.CInstruction)]) for alias in vector_alias: # Rename lhs which would match the substitution rule since loopy doesn't want substitutions as lhs new_insns = [] - for insn in knl_without_cinsn.instructions: + for insn in knl_with_subst_insns.instructions: if isinstance(insn, lp.Assignment) and isinstance(insn.assignee, prim.Subscript): if insn.assignee.aggregate.name == alias: new_insns.append(insn.copy(assignee=prim.Subscript(prim.Variable('dummy_' + alias), insn.assignee.index_tuple))) - pass else: new_insns.append(insn) else: new_insns.append(insn) - knl_without_cinsn = knl_without_cinsn.copy(instructions=new_insns) - - # substitution rule for alias[ex_outer,ex_inner, ey, ix, iy] -> vec[ex_inner] - parameters = 'ex_o,ex_i,' + ','.join(['e' + d for d in dim_names[1:dim]]) + \ - ',ix,' + ','.join(['i' + d for d in dim_names[1:dim]]) - knl_without_cinsn = lp.extract_subst(knl_without_cinsn, alias + '_subst', '{}[{}]'.format(alias, parameters), - parameters=parameters) - new_subst = knl_without_cinsn.substitutions.copy() + knl_with_subst_insns = knl_with_subst_insns.copy(instructions=new_insns) + + # substitution rule for alias[[ex_o]*l,ex_inner, ey, ix, iy] -> vec[ex_inner] + parameters = ','.join(['ex_o{}'.format(l) for l in range(level + 1)]) + \ + ',v_i,' + \ + ','.join(['e' + d for d in dim_names[1:dim]]) + \ + ',ix,' + \ + ','.join(['i' + d for d in dim_names[1:dim]]) + knl_with_subst_insns = lp.extract_subst(knl_with_subst_insns, + alias + '_subst', '{}[{}]'.format(alias, parameters), + parameters=parameters) + new_subst = knl_with_subst_insns.substitutions.copy() rule = new_subst[alias + '_subst'] - rule.expression = prim.Subscript(prim.Variable(alias.replace('alias', 'vec')), (prim.Variable('ex_i'),)) - knl_without_cinsn = knl_without_cinsn.copy(substitutions=new_subst) + rule.expression = prim.Subscript(prim.Variable(alias.replace(alias_suffix, vector_sufix)), + (prim.Variable('v_i'),)) + knl_with_subst_insns = knl_with_subst_insns.copy(substitutions=new_subst) - from loopy.match import All - knl_without_cinsn = lp.expand_subst(knl_without_cinsn, All()) - knl = knl_without_cinsn.copy(instructions=knl_without_cinsn.instructions + [insn for insn in knl.instructions - if isinstance(insn, lp.CInstruction)]) + knl_with_subst_insns = lp.expand_subst(knl_with_subst_insns, Iname(inner_iname)) + knl = knl.copy(instructions=knl_with_subst_insns.instructions + [insn for insn in knl.instructions + if insn.id not in knl_with_subst_insns.id_to_insn]) # add store and load dependencies and set right accumulation assignee new_insns = [] @@ -305,17 +333,19 @@ def add_vcl_access(knl, iname_inner): try: assignee_vec = next((expr for expr in insn.expression.children if isinstance(expr, prim.Subscript) and - expr.aggregate.name.replace('vec', 'alias') == + expr.aggregate.name.replace(vector_sufix, alias_suffix) == assignee_alias.aggregate.name.replace('dummy_', ''))) except StopIteration: from dune.codegen.error import CodegenVectorizationError raise CodegenVectorizationError new_insns.append(insn.copy(assignee=assignee_vec, depends_on=(insn.depends_on | read_dependencies[insn.id] | - write_ids))) + write_ids), + tags=insn.tags | frozenset({'vectorized_{}'.format(level)}))) else: new_insns.append(insn.copy(depends_on=(insn.depends_on | read_dependencies[insn.id] | - write_ids))) + write_ids), + tags=insn.tags | frozenset({'vectorized_{}'.format(level)}))) return knl.copy(instructions=new_insns + load_insns + store_insns) @@ -331,38 +361,66 @@ def find_accumulation_inames(knl): return inames -def add_iname_array(knl, vec_iname): - insns_with_macro_points = lp.find_instructions(knl, Tagged(vec_iname)) +def add_iname_array(knl, iname): + insns_with_macro_points = lp.find_instructions(knl, Tagged(iname)) if insns_with_macro_points: - array_name = vec_iname + '_arr' - vector_name = vec_iname + '_vec' + new_iname = knl.get_var_name_generator()(iname) + + new_dom = BasicSet('{{ [{0}] : 0<={0}<{1} }}'.format(new_iname, get_form_option('number_of_blocks'))) + + array_name = iname + '_arr' new_temporaries = dict() new_temporaries[array_name] = DuneTemporaryVariable(array_name, managed=True, shape=(get_form_option('number_of_blocks'),), scope=lp.temp_var_scope.PRIVATE, dtype=np.float64, - base_storage=vec_iname + '_buff', + base_storage=array_name + '_buff', _base_storage_access_may_be_aliasing=True) - new_temporaries[vector_name] = DuneTemporaryVariable(vector_name, managed=True, - shape=(get_form_option('number_of_blocks'),), - scope=lp.temp_var_scope.PRIVATE, dtype=np.float64, - base_storage=vec_iname + '_buff', - _base_storage_access_may_be_aliasing=True) - silenced_warning = ["read_no_write({})".format(vector_name)] - - new_insns = [lp.Assignment(assignee=prim.Subscript(prim.Variable(array_name), (prim.Variable(vec_iname),)), - expression=prim.Variable(vec_iname), - id='init_{}_buffer'.format(vec_iname), - within_inames=frozenset({vec_iname}), within_inames_is_final=True)] replacemap = dict() - replacemap[vec_iname] = prim.Subscript(prim.Variable(vector_name), (prim.Variable(vec_iname),)) + replacemap[iname] = prim.Subscript(prim.Variable(array_name), (prim.Variable(iname),)) + + new_insns = [lp.Assignment(assignee=prim.Subscript(prim.Variable(array_name), (prim.Variable(new_iname),)), + expression=prim.Variable(new_iname), + id='init_{}_buffer'.format(array_name), + within_inames=frozenset({new_iname}), within_inames_is_final=True)] + + for insn in knl.instructions: + if insn in insns_with_macro_points: + transformed_insn = insn.with_transformed_expressions(lambda expr: substitute(expr, replacemap)) + new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(array_name))) + else: + new_insns.append(insn) + knl = knl.copy(instructions=new_insns, domains=knl.domains + [new_dom], + temporary_variables=dict(**knl.temporary_variables, **new_temporaries)) + + return knl + + +def add_vcl_iname_array(knl, iname, vec_iname, vcl_size, level): + insns_with_macro_points = lp.find_instructions(knl, And((Tagged(iname), Iname(vec_iname)))) + + if insns_with_macro_points: + iname_array = iname + '_arr' + vector_name = iname + '_vec{}'.format(vcl_size) + + new_temporaries = {vector_name: DuneTemporaryVariable(vector_name, managed=True, + shape=(get_form_option('number_of_blocks'),), + scope=lp.temp_var_scope.PRIVATE, dtype=np.float64, + base_storage=iname_array + '_buff', + _base_storage_access_may_be_aliasing=True)} + silenced_warning = ["read_no_write({})".format(vector_name)] + + replacemap = {iname_array: prim.Variable(vector_name)} + + new_insns = [] for insn in knl.instructions: if insn in insns_with_macro_points: transformed_insn = insn.with_transformed_expressions(lambda expr: substitute(expr, replacemap)) - new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(vec_iname))) + new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(iname_array), + tags=insn.tags | frozenset({'vectorized_{}'.format(level)}))) else: new_insns.append(insn) @@ -370,27 +428,120 @@ def add_iname_array(knl, vec_iname): temporary_variables=dict(**knl.temporary_variables, **new_temporaries), silenced_warnings=knl.silenced_warnings + silenced_warning) - knl = lp.duplicate_inames(knl, [vec_iname], within=Id('init_{}_buffer'.format(vec_iname))) + knl = lp.split_array_axis(knl, (vector_name,), 0, vcl_size) + knl = lp.tag_array_axes(knl, (vector_name,), ('c', 'vec')) + + return knl + + +def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_size, level): + tail_size = get_form_option('number_of_blocks') % vcl_size + new_dom = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(tail_iname, tail_size)) + + insns_to_duplicate = lp.find_instructions(knl, Iname(inner_iname)) + ids_to_duplicate = tuple((insn.id for insn in insns_to_duplicate)) + + subst_map = dict([(outer_iname, outer_bound // vcl_size), + (inner_iname, prim.Variable(tail_iname))]) + + temporaries_to_duplicate = dict() + for insn in insns_to_duplicate: + if isinstance(insn, lp.Assignment): + assignee = insn.assignee + name = get_pymbolic_basename(assignee) + if name in knl.temporary_variables: + new_name = name + '_tail' + temporaries_to_duplicate[new_name] = knl.temporary_variables[name].copy(name=new_name) + subst_map[name] = prim.Variable(new_name) + + new_insns = [] + for insn in insns_to_duplicate: + new_insn = insn.with_transformed_expressions(lambda e: substitute(e, subst_map)) + new_depends_on = frozenset((insn_id + '_tail' if insn_id in ids_to_duplicate else insn_id + for insn_id in insn.depends_on)) + new_within_inames = frozenset((iname + '_tail' if iname == inner_iname else iname + for iname in insn.within_inames)) - frozenset({outer_iname}) + new_insns.append(new_insn.copy(id=insn.id + '_tail', depends_on=new_depends_on, + within_inames=new_within_inames, + tags=insn.tags | frozenset({'tail_{}'.format(level)}))) + + knl = knl.copy(domains=knl.domains + [new_dom], instructions=knl.instructions + new_insns, + temporary_variables=dict(**knl.temporary_variables, **temporaries_to_duplicate)) + + common_inames = knl.all_inames() + for insn in new_insns: + common_inames = common_inames & (insn.within_inames | insn.reduction_inames()) + + if get_form_option('vectorization_blockstructured_tail_ordering') == 'blocked': + # TODO need to be more clever to get the right inames + macro_inames = frozenset((iname + '_0' * level) for iname in sub_element_inames()) + common_inames = common_inames - macro_inames + + additional_inames_to_duplicate = frozenset() + for insn in new_insns: + insn_inames = insn.within_inames | insn.reduction_inames() + additional_inames_to_duplicate = additional_inames_to_duplicate | (insn_inames - common_inames) + + knl = lp.duplicate_inames(knl, tuple(additional_inames_to_duplicate), + Or(tuple((Id(insn.id) for insn in new_insns)))) + + return lp.make_reduction_inames_unique(knl) + + +def do_vectorization(knl, orig_iname, vec_iname, iname_bound, vcl_size, level=0): + inner_iname = vec_iname + '_inner' + outer_iname = vec_iname + '_outer' + + tail_size = iname_bound % vcl_size + if get_form_option('vectorization_blockstructured_tail'): + tail_vcl_size = vcl_size + while tail_vcl_size > tail_size: + tail_vcl_size = tail_vcl_size // 2 + vectorize_tail = tail_vcl_size > 1 + else: + vectorize_tail = False + + # manually add tail, since split_iname with slabs tries to vectorize the tail + if tail_size > 0: + # fake suitable loop bound + vectorizable_bound = (iname_bound // vcl_size) * vcl_size + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(knl, (vec_iname,)) + knl = knl.copy(domains=domch.get_domains_with( + BasicSet('{{ [{0}]: 0<={0}<{1} }}'.format(vec_iname, vectorizable_bound)))) + + knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname) + + tail_iname = vec_iname + '_inner' + '_tail' + knl = realize_tail(knl, inner_iname, outer_iname, iname_bound, tail_iname, vcl_size, level) + else: + knl = lp.split_iname(knl, vec_iname, vcl_size) + + knl = lp.tag_inames(knl, [(inner_iname, 'vec')]) + + array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] + knl = lp.split_array_axis(knl, array_alias, level, vcl_size) + + knl = add_vcl_temporaries(knl, vcl_size) + knl = add_vcl_iname_array(knl, orig_iname, inner_iname, vcl_size, level) + knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level) + knl = add_vcl_access(knl, inner_iname, vcl_size, level) + + if tail_size > 0: + knl = lp.add_dependency(knl, And((Tagged('tail_{}'.format(level)), Not(Tagged('head*')))), + Tagged('vectorized_{}'.format(level))) + if vectorize_tail: + knl = do_vectorization(knl, orig_iname, tail_iname, tail_size, tail_vcl_size, level + 1) return knl def vectorize_micro_elements(knl): vec_iname = "subel_x" + orig_iname = vec_iname if vec_iname in knl.all_inames() and get_global_context_value('integral_type') == 'cell': vcl_size = get_vcl_type_size(np.float64) - - assert get_form_option('number_of_blocks') % vcl_size == 0 - knl = add_iname_array(knl, vec_iname) - knl = lp.split_iname(knl, vec_iname, vcl_size, inner_tag='vec') - array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] - iname_vector = [a for a in knl.temporary_variables.keys() if a.endswith('vec')] - knl = lp.split_array_axis(knl, array_alias + iname_vector, 0, vcl_size) - knl = lp.tag_array_axes(knl, iname_vector, ('c', 'vec')) - - knl = add_vcl_temporaries(knl) - knl = add_vcl_accum_insns(knl, vec_iname + '_inner', vec_iname + '_outer') - knl = add_vcl_access(knl, vec_iname + '_inner') + knl = do_vectorization(knl, orig_iname, orig_iname, get_form_option('number_of_blocks'), vcl_size) return knl diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py index 8642486908dbbea2c9bde300fe377cd4ae94c1ba..7455f1bb242fa042011f005e154e8088264ad47f 100644 --- a/python/dune/codegen/options.py +++ b/python/dune/codegen/options.py @@ -102,6 +102,8 @@ class CodegenFormOptionsArray(ImmutableRecord): blockstructured = CodegenOption(default=False, helpstr="Use block structure") number_of_blocks = CodegenOption(default=1, helpstr="Number of sub blocks in one direction") vectorization_blockstructured = CodegenOption(default=False, helpstr="Vectorize block structuring") + vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'nunmber_of_blocks' is not divisible by vector length") + vectorization_blockstructured_tail_ordering = CodegenOption(default='consecutive', helpstr="Ordering of the tail w.r.t the vectorized loop. Possible values: consecutive|blocked") adjoint = CodegenOption(default=False, helpstr="Generate adjoint operator") control = CodegenOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable") objective_function = CodegenOption(default=None, helpstr="Name of form representing the objective function in UFL file") diff --git a/test/blockstructured/poisson/CMakeLists.txt b/test/blockstructured/poisson/CMakeLists.txt index 1b48ca5f343efeffe528a5066d3bc4073898b8a5..aa711fedea7e867da04f96050bb817c7935ef83f 100644 --- a/test/blockstructured/poisson/CMakeLists.txt +++ b/test/blockstructured/poisson/CMakeLists.txt @@ -41,4 +41,9 @@ dune_add_formcompiler_system_test(UFLFILE poisson.ufl dune_add_formcompiler_system_test(UFLFILE poisson.ufl BASENAME blockstructured_poisson_grid INIFILE poisson_grid.mini -) \ No newline at end of file +) + +dune_add_formcompiler_system_test(UFLFILE poisson.ufl + BASENAME blockstructured_poisson_vec_tail + INIFILE poisson_vec_tail.mini + ) diff --git a/test/blockstructured/poisson/poisson_tensor.mini b/test/blockstructured/poisson/poisson_tensor.mini index 6f18a924c4deabebaeb8447abd77076b0e2e127f..1e6e90fa76bdf3c00be899a879921845cbdf73bb 100644 --- a/test/blockstructured/poisson/poisson_tensor.mini +++ b/test/blockstructured/poisson/poisson_tensor.mini @@ -1,8 +1,13 @@ __name = blockstructured_poisson_tensor_{__exec_suffix} -__exec_suffix = {grid_suffix}_{vec_suffix}_{dim_suffix} +__exec_suffix = {grid_suffix}_{vec_suffix}_{dim_suffix}_blocks_{blocks} dim = 2, 3 | expand dimension +blocks_2d = 8, 7 | expand blocks +blocks_3d = 4, 5 | expand blocks + +blocks = {blocks_2d}, {blocks_3d} | expand dimension + grid_suffix = structured, unstructured | expand unstructured vec_suffix = nonvec, vec | expand vectorized dim_suffix = 2d, 3d | expand dimension @@ -26,7 +31,7 @@ matrix_free = 1 vectorization_blockstructured = 0, 1 | expand vectorized generate_jacobians = 0 blockstructured = 1 -number_of_blocks = 8, 4 | expand dimension +number_of_blocks = {blocks} geometry_mixins = blockstructured_equidistant, blockstructured_multilinear | expand unstructured [formcompiler.ufl_variants] diff --git a/test/blockstructured/poisson/poisson_vec_tail.mini b/test/blockstructured/poisson/poisson_vec_tail.mini new file mode 100644 index 0000000000000000000000000000000000000000..9582d752ca3a6b26ea2e9dab28b8754a3f244953 --- /dev/null +++ b/test/blockstructured/poisson/poisson_vec_tail.mini @@ -0,0 +1,34 @@ +__name = blockstructured_poisson_vec_tail_{__exec_suffix} +__exec_suffix = {dimname}_{tail_suffix} + +dim = 2, 3 | expand dimension +dimname = 2d, 3d | expand dimension + +cells = 8, 2 | expand dimension | repeat {dim} +extension = 1. | repeat {dim} + +tail_vec = 0, 1 | expand tail_vec +tail_modus = consecutive, blocked | expand mod +tail_suffix = novec_{tail_modus}, vec_{tail_modus} | expand tail_vec + +[wrapper.vtkcompare] +name = {__name} +reference = poisson_ref +extension = vtu + +[formcompiler] +compare_l2errorsquared = 1e-7 + +[formcompiler.r] +matrix_free = 1 +generate_jacobians = 0 +blockstructured = 1 +number_of_blocks = 15, 7 | expand dimension +vectorization_blockstructured = 1 +vectorization_blockstructured_tail = {tail_vec} +vectorization_blockstructured_tail_ordering = {tail_modus} +geometry_mixins = blockstructured_equidistant + +[formcompiler.ufl_variants] +cell = quadrilateral, hexahedron | expand dimension +degree = 1 \ No newline at end of file