diff --git a/python/dune/perftool/loopy/transformations/collect_rotate.py b/python/dune/perftool/loopy/transformations/collect_rotate.py index 78070a10be4fef8c535a9c3dad6f0a02fe323edf..d99e530dbc4107691663c6bfd865bab274965caa 100644 --- a/python/dune/perftool/loopy/transformations/collect_rotate.py +++ b/python/dune/perftool/loopy/transformations/collect_rotate.py @@ -30,10 +30,21 @@ def rotate_function_mangler(knl, func, arg_dtypes): # This is not 100% within the loopy philosophy, as we are # passing the vector registers as references and have them # changed. Loopy assumes this function to be read-only. + include_file("dune/perftool/sumfact/transposereg.hh", filetag="operatorfile") vcl = lp.types.NumpyType(get_vcl_type(np.float64, register_size=256)) return lp.CallMangleInfo("transpose_reg", (), (vcl, vcl, vcl, vcl)) +class VectorIndices(object): + def __init__(self): + self.needed = set() + + def get(self, increment): + name = "vec_index_inc{}".format(increment) + self.needed.add((name, increment)) + return prim.Variable(name) + + def collect_vector_data_rotate(knl): # # Process/Assert/Standardize the input @@ -51,6 +62,7 @@ def collect_vector_data_rotate(knl): # Determine the vector lane width # TODO infer the numpy type here vec_size = get_vcl_type_size(np.float64) + vector_indices = VectorIndices() # Add an iname to the kernel which will be used for vectorization new_iname = "quad_vec_{}".format("_".join(inames)) @@ -61,8 +73,6 @@ def collect_vector_data_rotate(knl): new_insns = [] all_writers = [] - tags = frozenset().union(*tuple(i.tags for i in insns)) - rotating = "gradvec" in tags # # Inspect the given instructions for dependent quantities @@ -168,10 +178,9 @@ def collect_vector_data_rotate(knl): # 1. Rotating the input data knl = add_vector_view(knl, quantity, flatview=True) - include_file("dune/perftool/sumfact/transposereg.hh", filetag="operatorfile") new_insns.append(lp.CallInstruction((), # assignees prim.Call(prim.Variable("transpose_reg"), - tuple(prim.Subscript(prim.Variable(get_vector_view_name(quantity)), (prim.Variable("vec_index") + i, prim.Variable(new_iname))) for i in range(4))), + tuple(prim.Subscript(prim.Variable(get_vector_view_name(quantity)), (vector_indices.get(vec_size) + i, prim.Variable(new_iname))) for i in range(4))), depends_on=frozenset({'continue_stmt'}), within_inames=common_inames.union(inames).union(frozenset({new_iname})), within_inames_is_final=True, @@ -183,14 +192,14 @@ def collect_vector_data_rotate(knl): assert isinstance(expr, prim.Subscript) last_index = expr.index[-1] replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), - (prim.Variable("vec_index") + last_index, prim.Variable(new_iname)), + (vector_indices.get(vec_size) + last_index, prim.Variable(new_iname)), ) else: # Add a vector view to this quantity expr, = quantities[quantity] knl = add_vector_view(knl, quantity, flatview=True) replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), - (prim.Variable("vec_index"), prim.Variable(new_iname)), + (vector_indices.get(1), prim.Variable(new_iname)), ) other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns + new_insns]] @@ -219,26 +228,6 @@ def collect_vector_data_rotate(knl): id="update_total_index", )) - # Insert a flat consecutive counter 'vec_index', which is increased after a vector chunk is handled - temporaries['vec_index'] = lp.TemporaryVariable('vec_index', # name - dtype=np.int32, - scope=lp.temp_var_scope.PRIVATE, - ) - new_insns.append(lp.Assignment(prim.Variable("vec_index"), # assignee - 0, # expression - within_inames=common_inames, - within_inames_is_final=True, - id="assign_vec_index", - )) - new_insns.append(lp.Assignment(prim.Variable("vec_index"), # assignee - prim.Sum((prim.Variable("vec_index"), vec_size if rotating else 1)), # expression - within_inames=common_inames.union(inames), - within_inames_is_final=True, - depends_on=frozenset({Tagged("vec_write"), "assign_vec_index"}), - depends_on_is_final=True, - id="update_vec_index", - )) - # Insert a rotating index, that counts 0 , .. , vecsize - 1 temporaries['rotate_index'] = lp.TemporaryVariable('rotate_index', # name dtype=np.int32, @@ -291,16 +280,19 @@ def collect_vector_data_rotate(knl): lhsname = get_pymbolic_basename(insn.assignee) knl = add_vector_view(knl, lhsname, pad_to=vec_size, flatview=True) lhsname = get_vector_view_name(lhsname) + rotating = "gradvec" in insn.tags if rotating: assert isinstance(insn.assignee, prim.Subscript) last_index = insn.assignee.index[-1] assert last_index in tuple(range(4)) + vec_index_size = vec_size else: last_index = 0 + vec_index_size = 1 new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname), - (prim.Variable("vec_index") + last_index, prim.Variable(new_iname)), + (vector_indices.get(vec_index_size) + last_index, prim.Variable(new_iname)), ), substitute(insn.expression, replacemap_vec), depends_on=frozenset({"continue_stmt"}), @@ -316,12 +308,33 @@ def collect_vector_data_rotate(knl): if rotating and "{}_rotateback".format(lhsname) not in [i.id for i in new_insns]: new_insns.append(lp.CallInstruction((), # assignees prim.Call(prim.Variable("transpose_reg"), - tuple(prim.Subscript(prim.Variable(lhsname), (prim.Variable("vec_index") + i, prim.Variable(new_iname))) for i in range(4))), + tuple(prim.Subscript(prim.Variable(lhsname), (vector_indices.get(vec_size) + i, prim.Variable(new_iname))) for i in range(4))), depends_on=frozenset({Tagged("vec_write")}), within_inames=common_inames.union(inames).union(frozenset({new_iname})), within_inames_is_final=True, id="{}_rotateback".format(lhsname), )) + # Add the necessary vector indices + for name, increment in vector_indices.needed: + temporaries[name] = lp.TemporaryVariable(name, # name + dtype=np.int32, + scope=lp.temp_var_scope.PRIVATE, + ) + new_insns.append(lp.Assignment(prim.Variable(name), # assignee + 0, # expression + within_inames=common_inames, + within_inames_is_final=True, + id="assign_{}".format(name), + )) + new_insns.append(lp.Assignment(prim.Variable(name), # assignee + prim.Sum((prim.Variable(name), increment)), # expression + within_inames=common_inames.union(inames), + within_inames_is_final=True, + depends_on=frozenset({Tagged("vec_write"), "assign_{}".format(name)}), + depends_on_is_final=True, + id="update_{}".format(name), + )) + from loopy.kernel.creation import resolve_dependencies return resolve_dependencies(knl.copy(instructions=new_insns + other_insns)) diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index 210677bf7b369f926394c3e1d537437b88bbe116..2c05e07a54e3bdb034c656f545606c3167a5a356 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -174,7 +174,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): expression=0, forced_iname_deps=frozenset(quadrature_inames() + visitor.inames), forced_iname_deps_is_final=True, - tags=frozenset(["quadvec"]) + tags=frozenset(["quadvec", "gradvec"]) ) # Replace gradient iname with correct index for assignement