diff --git a/python/dune/perftool/loopy/collectvector.py b/python/dune/perftool/loopy/collectvector.py index b5023e0e8fce7371c8a79d2c586cc5d64bc4bb4e..3d69d623f28851466fc44690ef876366f4f7aeb3 100644 --- a/python/dune/perftool/loopy/collectvector.py +++ b/python/dune/perftool/loopy/collectvector.py @@ -1,8 +1,12 @@ """ A kernel transformation that collects data until the vector size is reached """ -from dune.perftool.loopy.vcl import VCLLoad, VCLStore +from dune.perftool.loopy.vcl import get_vcl_type_size +from dune.perftool.loopy.vectorview import (add_vector_view, + get_vector_view_name, + ) from dune.perftool.tools import get_pymbolic_basename +from loopy.kernel.creation import parse_domains from loopy.symbolic import pw_aff_to_expr from pymbolic.mapper.dependency import DependencyMapper @@ -13,53 +17,17 @@ import loopy as lp import numpy as np -def load_instruction(vec, basepointer, offset=0, **kwargs): - # TODO: What I really want to pass in here is a pointer type - pointer = basepointer - if offset: - pointer = prim.Sum((pointer, offset)) - return lp.CallInstruction((), - prim.Call(VCLLoad(vec), (pointer,)), - tags=kwargs.get('tags', frozenset()).union(frozenset(["vcl_load"])), - **kwargs - ) - - -def store_instruction(vec, basepointer, offset=0, **kwargs): - # TODO: What I really want to pass in here is a pointer type - pointer = basepointer - if offset: - pointer = prim.Sum((pointer, offset)) - return lp.CallInstruction((), - prim.Call(VCLStore(vec), (pointer,)), - tags=kwargs.get('tags', frozenset()).union(frozenset(["vcl_store"])), - **kwargs - ) - - -def collect_vector_data(knl, insns, inames, vec_size=4): - # TODO: In theory this vec_size should be deduced from the types - # that we find in the actual instructions +def collect_vector_data(knl, insns, inames): + # + # Process/Assert/Standardize the input + # - # Process inames input -> frozenset + # inames input -> tuple if isinstance(inames, str): inames = inames.split(",") - inames = frozenset(inames) - - # Filter instructions into the groups that we need for this transformation: - # 1. Those that should be vectorized by this transformation -> insns - # 2. Writers of quantities that depend on the inames and that 1) need -> write_insns - # 3. "Outside" instructions, that the instructions from 1)+2) depend on -> dep_insns - # 4. All other instructions (needed to correctly reconstruct) -> other_insns - # - # All these should be lists of instruction instances. + inames = tuple(i.strip() for i in inames) - # 1. Those that should be vectorized by this transformation -> insns - # - # Instructions may be passed in as either: - # * A match expression object - # * A comma separated string of IDs - # * An iterable of IDs + # insns -> list of Instruction instances if isinstance(insns, lp.match.MatchExpressionBase): insns = lp.find_instructions(knl, insns) else: @@ -67,205 +35,80 @@ def collect_vector_data(knl, insns, inames, vec_size=4): insns = [i.strip() for i in insns.split(",")] insns = [knl.id_to_insn[i] for i in insns] - # 2. Writers of quantities that depend on the inames and that 1) need -> write_insns # - # To determine these, we need to - # * Find all written quantities in the instructions from 1) - # * Find the instructions that write to these quantities - # * Filter only those that depend on the given inames - quantities = {} - depmapper = DependencyMapper() + # Inspect the given instructions for dependent quantities + # and precompute them + # + + quantities = [] for insn in insns: - for expr in depmapper(insn.expression): - basename = get_pymbolic_basename(expr) - quantities.setdefault(basename, frozenset()) - quantities[basename] = quantities[basename].union(frozenset([expr])) + for expr in DependencyMapper()(insn.expression): + quantities.append(get_pymbolic_basename(expr)) + quantities = set(quantities) + + for quantity in quantities: + # Introduce a substitution rule and find save name + subst_old = knl.substitutions.keys() + knl = lp.assignment_to_subst(knl, quantity, extra_arguments=inames) + subst_new = knl.substitutions.keys() + subst_name, = set(subst_new) - set(subst_old) - write_match = lp.match.Or(tuple(lp.match.Writes(q) for q in quantities)) - iname_match = lp.match.And(tuple(lp.match.Iname(i) for i in inames)) - match = lp.match.And((write_match, iname_match)) - write_insns = lp.find_instructions(knl, match) + # Do precomputation of the quantity + prec_quantity = "{}_precomputed".format(quantity) + compute_id = "{}_compute_id".format(quantity) + compute_ids.append(compute_id) - # 3. "Outside" instructions, that the instructions from 1) depend on -> dep_insns - deps = frozenset([]).union(*(insn.depends_on for insn in insns + write_insns)) - deps = deps - frozenset([i.id for i in insns + write_insns]) - dep_insns = lp.find_instructions(knl, lp.match.Or(tuple(lp.match.Id(i) for i in deps))) + knl = lp.precompute(knl, subst_name, inames, + temporary_name=prec_quantity, + compute_insn_id=compute_id) - # 4. All other instructions (needed to correctly reconstruct) -> other_insns - used_ids = frozenset([i.id for i in insns + write_insns + dep_insns]) - other_insns = lp.find_instructions(knl, lp.match.Not(lp.match.Or(tuple(lp.match.Id(i) for i in used_ids)))) + # Introduce a vector view of the precomputation result + knl = add_vector_view(knl, prec_quantity) # - # Assert some assumptions on the instructions + # Construct a flat loop for the given instructions # - # An instruction occurs in but one of these groups: - assert len(set(insns + write_insns + dep_insns + other_insns)) == len(insns + write_insns + dep_insns + other_insns) - - # Analyse the inames of the given instructions and identify inames - # that they all have in common. Those inames will also be iname dependencies - # of inserted instruction. - common_inames = frozenset([]).union(*(insn.within_inames for insn in insns)) - inames - - # Have a list gathering all *new* instructions new_insns = [] - temporaries = knl.temporary_variables - keep_rules = [] - - # Insert a flat consecutive counter 'total_index' - temporaries['total_index'] = lp.TemporaryVariable('total_index', # name - dtype=np.int32, - scope=lp.temp_var_scope.LOCAL, - ) - new_insns.append(lp.Assignment(prim.Variable("total_index"), # assignee - 0, # expression - within_inames=common_inames, - within_inames_is_final=True, - id="assign_total_index", - )) - new_insns.append(lp.Assignment(prim.Variable("total_index"), # assignee - prim.Sum((prim.Variable("total_index"), 1)), # expression - within_inames=common_inames.union(inames), - within_inames_is_final=True, - depends_on=frozenset([i.id for i in write_insns]).union(frozenset({"assign_total_index"})), - depends_on_is_final=True, - id="update_total_index", - )) + other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns]] - # Insert a rotating index, that counts 0 , .. , vecsize - 1 - temporaries['rotate_index'] = lp.TemporaryVariable('rotate_index', # name - dtype=np.int32, - scope=lp.temp_var_scope.LOCAL, - ) - new_insns.append(lp.Assignment(prim.Variable("rotate_index"), # assignee - 0, # expression - within_inames=common_inames, - within_inames_is_final=True, - id="assign_rotate_index", - )) - new_insns.append(lp.Assignment(prim.Variable("rotate_index"), # assignee - prim.Remainder(prim.Sum((prim.Variable("rotate_index"), 1)), vec_size), # expression - within_inames=common_inames.union(inames), - within_inames_is_final=True, - depends_on=frozenset([i.id for i in write_insns]).union(frozenset({"assign_rotate_index"})), - depends_on_is_final=True, - id="update_rotate_index", - )) + size = prim.Product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames)) + vec_size = get_vcl_type_size(np.float64) + size = prim.FloorDiv(size, vec_size) - # Pre-evaluate all the needed quantities - replacemap_arr = {} - replacemap_poi = {} - replacemap_vec = {} - for quantity, quantity_exprs in quantities.items(): - # TODO for now I only consider the case where an array occurs but once! - assert len(quantity_exprs) == 1 - quantity_expr, = quantity_exprs - - # Determine the shape of the - - arrname = quantity + '_buffered_arr' - temporaries[arrname] = lp.TemporaryVariable(arrname, - dtype=np.float64, - shape=(vec_size,), - dim_tags="c", - base_storage=quantity + '_base_storage', - scope=lp.temp_var_scope.LOCAL, - ) - - vecname = quantity + '_buffered_vec' - temporaries[vecname] = lp.TemporaryVariable(vecname, - dtype=np.float64, - shape=(vec_size,), - dim_tags="vec", - base_storage=quantity + '_base_storage', - scope=lp.temp_var_scope.LOCAL, - ) - - replacemap_arr[quantity] = prim.Subscript(prim.Variable(arrname), (prim.Variable('rotate_index'),)) - replacemap_poi[quantity] = prim.Variable(arrname) - replacemap_vec[quantity_expr] = prim.Variable(vecname) - - for insn in write_insns: - if isinstance(insn, lp.Assignment): - new_insns.append(insn.copy(assignee=replacemap_arr[get_pymbolic_basename(insn.assignee)], - ) - ) - if isinstance(insn, lp.CInstruction): - # Rip apart the code and change the assignee - assignee, expression = insn.code.split("=") - assignee = assignee.strip() - assert assignee in replacemap_arr - - code = "{} ={}".format(str(replacemap_arr[assignee]), expression) - new_insns.append(insn.copy(code=code)) + temporaries = knl.temporary_variables + temporaries["flatsize"] = lp.TemporaryVariable("flatsize", + dtype=np.int32, + shape=(), + ) + new_insns.append(lp.Assignment(prim.Variable("flatsize"), + size, + ) + ) - # Determine the condition for the continue statement - upper_bound = prim.Product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames)) - total_check = prim.Comparison(prim.Variable("total_index"), "<", upper_bound) - rotate_check = prim.Comparison(prim.Variable("rotate_index"), "!=", 0) - check = prim.LogicalAnd((rotate_check, total_check)) + # Add an additional domain to the kernel + new_iname = "flat_{}".format("_".join(inames)) + domain = "{{ [{0}] : 0<={0}<flatsize }}".format(new_iname, str(size)) + domain = parse_domains(domain, {}) + knl = knl.copy(domains=knl.domains + domain, + temporary_variables=temporaries) - # Insert the 'continue' statement - new_insns.append(lp.CInstruction((), # iname exprs that the code needs access to - "continue;", # the code - predicates=frozenset({check}), - depends_on=frozenset({"update_rotate_index", "update_total_index"}).union(frozenset([i.id for i in write_insns])), - depends_on_is_final=True, - within_inames=common_inames.union(inames), - within_inames_is_final=True, - id="continue_stmt", - )) + # Split and tag the flat iname + knl = lp.split_iname(knl, new_iname, vec_size, inner_tag="vec") + new_inames = ("{}_outer".format(new_iname), "{}_inner".format(new_iname)) + knl = lp.assume(knl, "flatsize mod {} = 0".format(vec_size)) for insn in insns: - # Turn the assignee array into a base storage array - basename = get_pymbolic_basename(insn.assignee) - assert basename in temporaries - if temporaries[basename].base_storage is None: - temporaries[basename] = temporaries[basename].copy(base_storage="{}_base".format(basename)) - - # Load the data for this instruction - load_instructions = [] - for expr in depmapper(insn.expression): - name = get_pymbolic_basename(expr) - new_insns.append(load_instruction(replacemap_vec[expr], - replacemap_poi[name], - depends_on=frozenset({"continue_stmt"}), - within_inames=inames, - within_inames_is_final=True, - id="{}_load_instr".format(name) - ) - ) - load_instructions.append("{}_load_instr".format(name)) - - # TODO do something about the assignee! - name = "{}_result_buffer".format(insn.id) - temporaries[name] = lp.TemporaryVariable(name, - dtype=np.float64, - shape=(vec_size,), - dim_tags="vec", - base_storage="{}_base".format(basename), - scope=lp.temp_var_scope.LOCAL, - ) - new_insns.append(insn.copy(assignee=prim.Variable(name), - expression=substitute(insn.expression, variable_assignments=replacemap_vec), - depends_on=insn.depends_on.union(frozenset(["continue_stmt"])).union(frozenset(load_instructions)) - ) - ) - - new_insns.append(store_instruction(prim.Variable(name), - prim.Variable(basename), - offset=prim.Sum((prim.Variable("total_index") // vec_size, -1)), - depends_on=frozenset({insn.id}), - within_inames=inames, - within_inames_is_final=True - ) - ) - - keep_rules.append('temp_to_write({})'.format(basename)) - keep_rules.append('read_no_write({})'.format(basename)) + # Get a vector view of the lhs expression + lhsname = get_pymbolic_basename(insn.assignee) + knl = add_vector_view(knl, lhsname) + lhsname = get_vector_view_name(lhsname) + + new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname), tuple(prim.Variable(i) for i in new_inames)), + prim.Subscript(prim.Variable(get_vector_view_name("wk_precomputed")), tuple(prim.Variable(i) for i in new_inames)), + within_inames=frozenset(new_inames), + within_inames_is_final=True, + ) + ) - # Return a kernel - return knl.copy(instructions=dep_insns + other_insns + new_insns, - temporary_variables=temporaries, - silenced_warnings=knl.silenced_warnings + keep_rules - ) + return knl.copy(instructions=new_insns + other_insns) diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index e19f3e1a4afdb0f3688af5bf2c3a5a42773a67b4..295a6029b3ba6891b84a85132fac2d640df271e6 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -2,7 +2,9 @@ from dune.perftool.generation import post_include from dune.perftool.loopy.temporary import DuneTemporaryVariable from dune.perftool.loopy.vcl import VCLTypeRegistry -from dune.perftool.generation import retrieve_cache_functions +from dune.perftool.generation import (include_file, + retrieve_cache_functions, + ) from loopy.target import (TargetBase, ASTBuilderBase, @@ -109,6 +111,7 @@ class DuneTarget(TargetBase): def dtype_to_typename(self, dtype): if dtype.dtype.kind == "V": + include_file("dune/perftool/vectorclass/vectorclass.h", filetag="operatorfile") return VCLTypeRegistry.names[dtype.dtype] else: return _registry[dtype.dtype.name] diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py index 272e0ec7097a2ac07e0d19f1e1b439ed1a208da8..78dea3cfdf05c34d24b9ba872334a61d9b16d5be 100644 --- a/python/dune/perftool/loopy/vcl.py +++ b/python/dune/perftool/loopy/vcl.py @@ -47,6 +47,10 @@ def _populate_vcl_type_registry(): _populate_vcl_type_registry() +def get_vcl_type_size(nptype, register_size=256): + return register_size // (np.dtype(nptype).itemsize * 8) + + def get_vcl_type(nptype, register_size=None, vec_size=None): assert register_size or vec_size assert not(register_size and vec_size) @@ -55,34 +59,3 @@ def get_vcl_type(nptype, register_size=None, vec_size=None): vec_size = register_size // (np.dtype(nptype).itemsize * 8) return VCLTypeRegistry.types[np.dtype(nptype), vec_size] - - -class VCLLoad(FunctionIdentifier): - def __init__(self, vec): - self.vec = vec - - def __getinitargs__(self): - return (self.vec,) - - @property - def name(self): - return "{}->load".format(self.vec) - - -class VCLStore(FunctionIdentifier): - def __init__(self, vec): - self.vec = vec - - def __getinitargs__(self): - return (self.vec,) - - @property - def name(self): - return "{}->store".format(self.vec) - - -@function_mangler -def vcl_mangler(target, func, dtypes): - if isinstance(func, (VCLLoad, VCLStore)): - include_file("dune/perftool/vectorclass/vectorclass.h", filetag="operatorfile") - return CallMangleInfo(func.name, (), (NumpyType(np.int32),)) diff --git a/python/dune/perftool/loopy/vectorview.py b/python/dune/perftool/loopy/vectorview.py new file mode 100644 index 0000000000000000000000000000000000000000..86aefa81dabc0035b21ca9a846ed0d7111a7bb1d --- /dev/null +++ b/python/dune/perftool/loopy/vectorview.py @@ -0,0 +1,48 @@ +""" +Implement tools around the idea of having two 'views' on memory: +One being an ordinary array with proper shape and so on, and one +being a an array of SIMD vectors +""" + +from dune.perftool.loopy.vcl import get_vcl_type_size + +import loopy as lp +import numpy as np +import pymbolic.primitives as prim + +def get_vector_view_name(tmpname): + return tmpname + "_vec" + + +def add_vector_view(knl, tmpname): + """ + Kernel transformation to add a vector view temporary + that interprets the same memory as another temporary + """ + temporaries = knl.temporary_variables + assert tmpname in temporaries + temp = temporaries[tmpname] + vecname = get_vector_view_name(tmpname) + + if vecname in knl.temporary_variables: + return knl + + # Add base storage to the original temporary! + if not temp.base_storage: + temp = temp.copy(base_storage=tmpname + "_base") + temporaries[tmpname] = temp + + # Determine the shape by dividing total size by vector size + vecsize = get_vcl_type_size(temp.dtype) + size = prim.FloorDiv(prim.Product(temp.shape), vecsize) + + # Now add a vector view temporary + vecname = tmpname + "_vec" + temporaries[vecname] = lp.TemporaryVariable(vecname, + dim_tags="c,vec", + shape=(size, vecsize), + base_storage=tmpname + "_base", + dtype=np.float64, + ) + + return knl.copy(temporary_variables=temporaries)