From 0c2f06d34510c5b8ecb0890950ce06892a4423a3 Mon Sep 17 00:00:00 2001 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> Date: Thu, 17 Nov 2016 14:03:53 +0100 Subject: [PATCH] Saving some work --- python/dune/perftool/loopy/__init__.py | 1 + .../loopy/transformations/collect_rotate.py | 184 ++++++++++++++++++ .../loopy/transformations/vectorview.py | 9 + 3 files changed, 194 insertions(+) create mode 100644 python/dune/perftool/loopy/transformations/collect_rotate.py diff --git a/python/dune/perftool/loopy/__init__.py b/python/dune/perftool/loopy/__init__.py index 2ef32a24..e645b4c8 100644 --- a/python/dune/perftool/loopy/__init__.py +++ b/python/dune/perftool/loopy/__init__.py @@ -1,4 +1,5 @@ """ Export the interface interesting to the rest of the project """ from dune.perftool.loopy.transformations.collect_precompute import collect_vector_data_precompute +from dune.perftool.loopy.transformations.collect_rotate import collect_vector_data_rotate from dune.perftool.loopy.transformations.duplicate import heuristic_duplication diff --git a/python/dune/perftool/loopy/transformations/collect_rotate.py b/python/dune/perftool/loopy/transformations/collect_rotate.py new file mode 100644 index 00000000..d702ab8c --- /dev/null +++ b/python/dune/perftool/loopy/transformations/collect_rotate.py @@ -0,0 +1,184 @@ +""" A kernel transformation that precomputes quantities until a vector register +is filled and then does vector computations """ + +from dune.perftool.loopy.vcl import get_vcl_type_size +from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_vector_view, + add_vector_view, + get_vector_view_name, + ) +from dune.perftool.tools import get_pymbolic_basename + +from loopy.kernel.creation import parse_domains +from loopy.symbolic import pw_aff_to_expr + +from pymbolic.mapper.dependency import DependencyMapper +from pymbolic.mapper.substitutor import substitute + +import pymbolic.primitives as prim +import loopy as lp +import numpy as np + + +def collect_vector_data_rotate(knl, insns, inames): + # + # Process/Assert/Standardize the input + # + + # inames input -> tuple + if isinstance(inames, str): + inames = inames.split(",") + inames = tuple(i.strip() for i in inames) + + # insns -> list of Instruction instances + if isinstance(insns, lp.match.MatchExpressionBase): + insns = lp.find_instructions(knl, insns) + else: + if isinstance(insns, str): + insns = [i.strip() for i in insns.split(",")] + insns = [knl.id_to_insn[i] for i in insns] + + # Analyse the inames of the given instructions and identify inames + # that they all have in common. Those inames will also be iname dependencies + # of inserted instructions. + common_inames = frozenset([]).union(*(insn.within_inames for insn in insns)) - frozenset(inames) + + # Determine the vector lane width + # TODO infer the numpy type here + vec_size = get_vcl_type_size(np.float64) + + # + # Inspect the given instructions for dependent quantities + # + + quantities = {} + for insn in insns: + for expr in DependencyMapper()(insn.expression): + basename = get_pymbolic_basename(expr) + quantities.setdefault(basename, frozenset()) + quantities[basename] = quantities[basename].union(frozenset([expr])) + assert all(len(q) == 1 for q in quantities.values()) + + # Add vector size buffers for all these quantities + replacemap_arr = {} + replacemap_vec = {} + for quantity in quantities: + expr, = quantities[quantity] + arrname = quantity + '_buffered_arr' + knl = add_temporary_with_vector_view(knl, + arrname, + dtype=np.float64, + shape=(vec_size,), + dim_tags="c", + base_storage=quantity + '_base_storage', + ) + + replacemap_arr[quantity] = prim.Subscript(prim.Variable(arrname), (prim.Variable('rotate_index'),)) + replacemap_vec[expr] = prim.Variable(get_vector_view_name(arrname)) + + write_match = lp.match.Or(tuple(lp.match.Writes(q) for q in quantities)) + iname_match = lp.match.And(tuple(lp.match.Iname(i) for i in inames)) + match = lp.match.And((write_match, iname_match)) + write_insns = lp.find_instructions(knl, match) + + other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns + write_insns]] + new_insns = [] + temporaries = knl.temporary_variables + + for insn in write_insns: + if isinstance(insn, lp.Assignment): + new_insns.append(insn.copy(assignee=replacemap_arr[get_pymbolic_basename(insn.assignee)], + ) + ) + elif isinstance(insn, lp.CInstruction): + pass + else: + raise NotImplementedError + + # + # Add two counter variables to the kernel + # + + # Insert a flat consecutive counter 'total_index' + temporaries['total_index'] = lp.TemporaryVariable('total_index', # name + dtype=np.int32, + ) + new_insns.append(lp.Assignment(prim.Variable("total_index"), # assignee + 0, # expression + within_inames=common_inames, + within_inames_is_final=True, + id="assign_total_index", + )) + new_insns.append(lp.Assignment(prim.Variable("total_index"), # assignee + prim.Sum((prim.Variable("total_index"), 1)), # expression + within_inames=common_inames.union(inames), + within_inames_is_final=True, + depends_on=frozenset([i.id for i in write_insns]).union(frozenset({"assign_total_index"})), + depends_on_is_final=True, + id="update_total_index", + )) + + # Insert a rotating index, that counts 0 , .. , vecsize - 1 + temporaries['rotate_index'] = lp.TemporaryVariable('rotate_index', # name + dtype=np.int32, + ) + new_insns.append(lp.Assignment(prim.Variable("rotate_index"), # assignee + 0, # expression + within_inames=common_inames, + within_inames_is_final=True, + id="assign_rotate_index", + )) + new_insns.append(lp.Assignment(prim.Variable("rotate_index"), # assignee + prim.Remainder(prim.Sum((prim.Variable("rotate_index"), 1)), vec_size), # expression + within_inames=common_inames.union(inames), + within_inames_is_final=True, + depends_on=frozenset([i.id for i in write_insns]).union(frozenset({"assign_rotate_index"})), + depends_on_is_final=True, + id="update_rotate_index", + )) + + # + # Construct a flat loop for the given instructions + # + +# new_insns = [] +# other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns]] +# +# size = prim.Product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames)) +# size = prim.FloorDiv(size, vec_size) +# +# temporaries = knl.temporary_variables +# temporaries["flatsize"] = lp.TemporaryVariable("flatsize", +# dtype=np.int32, +# shape=(), +# ) +# new_insns.append(lp.Assignment(prim.Variable("flatsize"), +# size, +# ) +# ) +# +# # Add an additional domain to the kernel +# new_iname = "flat_{}".format("_".join(inames)) +# domain = "{{ [{0}] : 0<={0}<flatsize }}".format(new_iname, str(size)) +# domain = parse_domains(domain, {}) +# knl = knl.copy(domains=knl.domains + domain, +# temporary_variables=temporaries) +# +# # Split and tag the flat iname +# knl = lp.split_iname(knl, new_iname, vec_size, inner_tag="vec") +# new_inames = ("{}_outer".format(new_iname), "{}_inner".format(new_iname)) +# knl = lp.assume(knl, "flatsize mod {} = 0".format(vec_size)) +# +# for insn in insns: +# # Get a vector view of the lhs expression +# lhsname = get_pymbolic_basename(insn.assignee) +# knl = add_vector_view(knl, lhsname) +# lhsname = get_vector_view_name(lhsname) +# +# new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname), tuple(prim.Variable(i) for i in new_inames)), +# prim.Subscript(prim.Variable(get_vector_view_name("wk_precomputed")), tuple(prim.Variable(i) for i in new_inames)), +# within_inames=frozenset(new_inames), +# within_inames_is_final=True, +# ) +# ) + + return knl.copy(instructions=new_insns + other_insns) diff --git a/python/dune/perftool/loopy/transformations/vectorview.py b/python/dune/perftool/loopy/transformations/vectorview.py index 86aefa81..dd36969b 100644 --- a/python/dune/perftool/loopy/transformations/vectorview.py +++ b/python/dune/perftool/loopy/transformations/vectorview.py @@ -46,3 +46,12 @@ def add_vector_view(knl, tmpname): ) return knl.copy(temporary_variables=temporaries) + + +def add_temporary_with_vector_view(knl, name, *args, **kwargs): + temps = knl.temporary_variables + assert name not in temps + temps[name] = lp.TemporaryVariable(name, *args, **kwargs) + knl = knl.copy(temporary_variables=temps) + knl = add_vector_view(knl, name) + return knl \ No newline at end of file -- GitLab