diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index 8cc073fc68c89572dd0dd5b9e5947cc5690f5439..171ba6709c27fa8e88876da6180ef1b9debd26f4 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -50,6 +50,8 @@ def valuearg(name, **kw): @generator_factory(item_tags=("domain",), context_tags="kernel") def domain(iname, shape): + if isinstance(iname, tuple): + iname = ",".join(iname) if isinstance(shape, str): valuearg(shape) return "{{ [{0}] : 0<={0}<{1} }}".format(iname, shape) diff --git a/python/dune/perftool/loopy/transformations/collect_precompute.py b/python/dune/perftool/loopy/transformations/collect_precompute.py index 9eda9688efc423057d57a05278da1d22599569f7..73f13619a14e9bf14df4ad808239f6f834a40dbe 100644 --- a/python/dune/perftool/loopy/transformations/collect_precompute.py +++ b/python/dune/perftool/loopy/transformations/collect_precompute.py @@ -1,112 +1,291 @@ """ A kernel transformation that precomputes data and then splits computation in chunks of vector size independent of divisibility of the loop bounds. """ -from dune.perftool.loopy.vcl import get_vcl_type_size -from dune.perftool.loopy.transformations.vectorview import (add_vector_view, +from dune.perftool.generation import (function_mangler, + include_file, + loopy_class_member, + ) +from dune.perftool.loopy.vcl import get_vcl_type, get_vcl_type_size +from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_vector_view, + add_vector_view, get_vector_view_name, ) -from dune.perftool.tools import get_pymbolic_basename +from dune.perftool.loopy.symbolic import substitute +from dune.perftool.sumfact.quadrature import quadrature_inames +from dune.perftool.tools import get_pymbolic_basename, get_pymbolic_tag, ceildiv +from dune.perftool.options import get_option from loopy.kernel.creation import parse_domains from loopy.symbolic import pw_aff_to_expr +from loopy.match import Tagged -from pymbolic.mapper.dependency import DependencyMapper -from pymbolic.mapper.substitutor import substitute +from loopy.symbolic import DependencyMapper +from pytools import product import pymbolic.primitives as prim import loopy as lp import numpy as np +import re -def collect_vector_data_precompute(knl, insns, inames): +class TransposeReg(lp.symbolic.FunctionIdentifier): + def __init__(self, + horizontal=1, + vertical=1, + ): + self.horizontal = horizontal + self.vertical = vertical + + def __getinitargs__(self): + return (self.horizontal, self.vertical) + + @property + def name(self): + return "transpose_reg" + + +@function_mangler +def rotate_function_mangler(knl, func, arg_dtypes): + if isinstance(func, TransposeReg): + # This is not 100% within the loopy philosophy, as we are + # passing the vector registers as references and have them + # changed. Loopy assumes this function to be read-only. + include_file("dune/perftool/sumfact/transposereg.hh", filetag="operatorfile") + vcl = lp.types.NumpyType(get_vcl_type(np.float64, vector_width=func.horizontal * func.vertical)) + return lp.CallMangleInfo(func.name, (), (vcl,) * func.horizontal) + + +class VectorIndices(object): + def __init__(self): + self.needed = set() + + def get(self, increment): + name = "vec_index_inc{}".format(increment) + self.needed.add((name, increment)) + return prim.Variable(name) + + +def collect_vector_data_precompute(knl): # # Process/Assert/Standardize the input # - # inames input -> tuple - if isinstance(inames, str): - inames = inames.split(",") - inames = tuple(i.strip() for i in inames) + insns = [i for i in lp.find_instructions(knl, lp.match.Tagged("quadvec"))] + if not insns: + return knl + inames = quadrature_inames() + + # Analyse the inames of the given instructions and identify inames + # that they all have in common. Those inames will also be iname dependencies + # of inserted instructions. + common_inames = frozenset([]).union(*(insn.within_inames for insn in insns)) - frozenset(inames) - # insns -> list of Instruction instances - if isinstance(insns, lp.match.MatchExpressionBase): - insns = lp.find_instructions(knl, insns) - else: - if isinstance(insns, str): - insns = [i.strip() for i in insns.split(",")] - insns = [knl.id_to_insn[i] for i in insns] + # Determine the vector lane width + # TODO infer the numpy type here + vec_size = get_vcl_type_size(np.float64) + vector_indices = VectorIndices() # # Inspect the given instructions for dependent quantities # and precompute them # + quantities = [] for insn in insns: for expr in DependencyMapper()(insn.expression): quantities.append(get_pymbolic_basename(expr)) quantities = set(quantities) + prec_quantities = [] for quantity in quantities: - # Introduce a substitution rule and find save name - subst_old = knl.substitutions.keys() - knl = lp.assignment_to_subst(knl, quantity, extra_arguments=inames) - subst_new = knl.substitutions.keys() - subst_name, = set(subst_new) - set(subst_old) + # Check whether there is an instruction that writes this quantity within + # the given inames. If so, we need a buffer array. + iname_match = lp.match.And(tuple(lp.match.Iname(i) for i in inames)) + write_match = lp.match.Writes(quantity) + match = lp.match.And((iname_match, write_match)) + write_insns = lp.find_instructions(knl, match) - # Do precomputation of the quantity - prec_quantity = "{}_precomputed".format(quantity) - knl = lp.precompute(knl, subst_name, inames, - temporary_name=prec_quantity, - ) + if write_insns: + # Introduce a substitution rule and find save name + subst_old = knl.substitutions.keys() + knl = lp.assignment_to_subst(knl, quantity) + subst_new = knl.substitutions.keys() + subst_name, = set(subst_new) - set(subst_old) - # Introduce a vector view of the precomputation result - knl = add_vector_view(knl, prec_quantity) + # Do precomputation of the quantity + prec_quantity = "{}_precomputed".format(quantity) + prec_quantities.append(prec_quantity) + knl = lp.precompute(knl, subst_name, inames, + temporary_name=prec_quantity, + ) + + # Enforce memory layout of the precomputation + tmps = knl.temporary_variables + tmps[prec_quantity] = tmps[prec_quantity].copy(dim_tags=",".join(["f"] * len(inames))) + knl = knl.copy(temporary_variables=tmps) + + # Introduce a vector view of the precomputation result + knl = add_vector_view(knl, prec_quantity, flatview=True) # # Construct a flat loop for the given instructions # - new_insns = [] - other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns]] - size = prim.Product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames)) + size = product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames)) vec_size = get_vcl_type_size(np.float64) - size = prim.FloorDiv(size, vec_size) - - temporaries = knl.temporary_variables - temporaries["flatsize"] = lp.TemporaryVariable("flatsize", - dtype=np.int32, - shape=(), - ) - new_insns.append(lp.Assignment(prim.Variable("flatsize"), - size, - ) - ) + size = ceildiv(size, vec_size) # Add an additional domain to the kernel - new_iname = "flat_{}".format("_".join(inames)) - domain = "{{ [{0}] : 0<={0}<flatsize }}".format(new_iname, str(size)) - domain = parse_domains(domain, {}) - knl = knl.copy(domains=knl.domains + domain, - temporary_variables=temporaries) + outer_iname = "flat_{}".format("_".join(inames)) + o_domain = "{{ [{0}] : 0<={0}<{1} }}".format(outer_iname, size) + o_domain = parse_domains(o_domain, {}) + vec_iname = "vec_{}".format("_".join(inames)) + i_domain = "{{ [{0}] : 0<={0}<{1} }}".format(vec_iname, vec_size) + i_domain = parse_domains(i_domain, {}) + knl = knl.copy(domains=knl.domains + o_domain + i_domain) + knl = lp.tag_inames(knl, [(vec_iname, "vec")]) + + # Update instruction lists + insns = [i for i in lp.find_instructions(knl, lp.match.Tagged("quadvec"))] + other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns]] + quantities = {} + for insn in insns: + for expr in DependencyMapper()(insn.expression): + basename = get_pymbolic_basename(expr) + quantities.setdefault(basename, frozenset()) + quantities[basename] = quantities[basename].union(frozenset([expr])) + + replacemap = {} + + # Now gather a replacement map for all the quantities + for quantity, quantity_exprs in quantities.items(): + # This might be a quantity precomputed earlier + if quantity in prec_quantities: + for expr in quantity_exprs: + replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), (vector_indices.get(1), prim.Variable(vec_iname))) + # it might also be the output of a sumfactorization kernel + elif quantity in knl.temporary_variables: + tag, = set(get_pymbolic_tag(expr) for expr in quantity_exprs) + if tag is not None and tag.startswith('vecsumfac'): + # Extract information from the tag + horizontal, vertical = tuple(int(i) for i in re.match("vecsumfac_h(.*)_v(.*)", tag).groups()) - # Split and tag the flat iname - knl = lp.split_iname(knl, new_iname, vec_size, inner_tag="vec") - new_inames = ("{}_outer".format(new_iname), "{}_inner".format(new_iname)) - knl = lp.assume(knl, "flatsize mod {} = 0".format(vec_size)) + # 1. Rotating the input data + knl = add_vector_view(knl, quantity, flatview=True) + if horizontal > 1: + new_insns.append(lp.CallInstruction((), # assignees + prim.Call(TransposeReg(vertical=vertical, horizontal=horizontal), + tuple(prim.Subscript(prim.Variable(get_vector_view_name(quantity)), + (vector_indices.get(horizontal) + i, prim.Variable(vec_iname))) + for i in range(horizontal))), + depends_on=frozenset({'continue_stmt'}), + within_inames=common_inames.union(frozenset({outer_iname, vec_iname})), + within_inames_is_final=True, + id="{}_rotate".format(quantity), + )) + + # Add substitution rules + for expr in quantity_exprs: + assert isinstance(expr, prim.Subscript) + last_index = expr.index[-1] // vertical + replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), + (vector_indices.get(horizontal) + last_index, prim.Variable(vec_iname)), + ) + elif tag is not None and tag == 'sumfac': + # Add a vector view to this quantity + expr, = quantity_exprs + knl = add_vector_view(knl, quantity, flatview=True) + replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), + (vector_indices.get(1), prim.Variable(vec_iname)), + ) + elif quantity in [a.name for a in knl.args]: + arg, = [a for a in knl.args if a.name == quantity] + tags = set(get_pymbolic_tag(expr) for expr in quantity_exprs) + if tags and tags.pop() == "operator_precomputed": + expr, = quantity_exprs + shape=(ceildiv(product(s for s in arg.shape), vec_size), vec_size) + name = loopy_class_member(quantity, + shape=shape, + dim_tags="f,vec", + potentially_vectorized=True, + classtag="operator", + dtype=np.float64, + ) + knl = knl.copy(args=knl.args + [lp.GlobalArg(name, shape=shape, dim_tags="c,vec", dtype=np.float64)]) + replacemap[expr] = prim.Subscript(prim.Variable(name), + (vector_indices.get(1), prim.Variable(vec_iname)), + ) for insn in insns: # Get a vector view of the lhs expression lhsname = get_pymbolic_basename(insn.assignee) - knl = add_vector_view(knl, lhsname) + knl = add_vector_view(knl, lhsname, pad_to=vec_size, flatview=True) lhsname = get_vector_view_name(lhsname) + rotating = "gradvec" in insn.tags + + if rotating: + assert isinstance(insn.assignee, prim.Subscript) + tag = get_pymbolic_tag(insn.assignee) + horizontal, vertical = tuple(int(i) for i in re.match("vecsumfac_h(.*)_v(.*)", tag).groups()) + if horizontal > 1: + last_index = insn.assignee.index[-1] // vertical + else: + last_index = 0 + else: + last_index = 0 + horizontal = 1 - new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname), tuple(prim.Variable(i) for i in new_inames)), - prim.Subscript(prim.Variable(get_vector_view_name("wk_precomputed")), tuple(prim.Variable(i) for i in new_inames)), - within_inames=frozenset(new_inames), + new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname), + (vector_indices.get(horizontal) + last_index, prim.Variable(vec_iname)), + ), + substitute(insn.expression, replacemap), + depends_on=frozenset({"continue_stmt"}), + depends_on_is_final=True, + within_inames=common_inames.union(frozenset({outer_iname, vec_iname})), within_inames_is_final=True, + id=insn.id, + tags=frozenset({"vec_write"}) ) ) - return knl.copy(instructions=new_insns + other_insns) + # Rotate back! + if rotating and "{}_rotateback".format(lhsname) not in [i.id for i in new_insns] and horizontal > 1: + new_insns.append(lp.CallInstruction((), # assignees + prim.Call(TransposeReg(horizontal=horizontal, vertical=vertical), + tuple(prim.Subscript(prim.Variable(lhsname), + (vector_indices.get(horizontal) + i, prim.Variable(vec_iname))) + for i in range(horizontal))), + depends_on=frozenset({Tagged("vec_write")}), + within_inames=common_inames.union(frozenset({outer_iname, vec_iname})), + within_inames_is_final=True, + id="{}_rotateback".format(lhsname), + )) + + # Add the necessary vector indices + temporaries = knl.temporary_variables + for name, increment in vector_indices.needed: + temporaries[name] = lp.TemporaryVariable(name, # name + dtype=np.int32, + scope=lp.temp_var_scope.PRIVATE, + ) + new_insns.append(lp.Assignment(prim.Variable(name), # assignee + 0, # expression + within_inames=common_inames, + within_inames_is_final=True, + id="assign_{}".format(name), + )) + new_insns.append(lp.Assignment(prim.Variable(name), # assignee + prim.Sum((prim.Variable(name), increment)), # expression + within_inames=common_inames.union(frozenset({outer_iname})), + within_inames_is_final=True, + depends_on=frozenset({Tagged("vec_write"), "assign_{}".format(name)}), + depends_on_is_final=True, + id="update_{}".format(name), + )) + + from loopy.kernel.creation import resolve_dependencies + return resolve_dependencies(knl.copy(instructions=new_insns + other_insns, + temporary_variables=temporaries, + )) diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 4e7227e12b1925ee59588023685471460046c3e6..d0b1d450d52a6ac6aad3ee892afdce4bdb64a899 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -503,7 +503,9 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): if get_option("vectorize_quad"): if get_option("sumfact"): from dune.perftool.loopy.transformations.collect_rotate import collect_vector_data_rotate - kernel = collect_vector_data_rotate(kernel) + from dune.perftool.loopy.transformations.collect_precompute import collect_vector_data_precompute + kernel = collect_vector_data_precompute(kernel) +# kernel = collect_vector_data_rotate(kernel) else: raise NotImplementedError("Only vectorizing sumfactorized code right now!") diff --git a/python/dune/perftool/sumfact/quadrature.py b/python/dune/perftool/sumfact/quadrature.py index d12b7aeb84da5c107235c7cf5872245a0e0500e1..b86f77514e60b6d8e0d76bb6b41dbcdba3fe0f08 100644 --- a/python/dune/perftool/sumfact/quadrature.py +++ b/python/dune/perftool/sumfact/quadrature.py @@ -4,6 +4,7 @@ from dune.perftool.generation import (backend, get_global_context_value, iname, instruction, + kernel_cached, loopy_class_member, temporary_variable, ) @@ -74,16 +75,12 @@ def pymbolic_base_weight(): return 1.0 -@iname -def sumfact_quad_iname(d, bound): - name = "quad_{}".format(d) - domain(name, quadrature_points_per_direction()) - return name - - @backend(interface="quad_inames", name="sumfact") +@kernel_cached def quadrature_inames(): - return tuple(sumfact_quad_iname(d, quadrature_points_per_direction()) for d in range(local_dimension())) + names = tuple("quad_{}".format(d) for d in range(local_dimension())) + domain(names, quadrature_points_per_direction()) + return names @iname(kernel="operator")