Skip to content
Snippets Groups Projects
Commit 1797a0ec authored by Dominic Kempf's avatar Dominic Kempf
Browse files

New implementation of precompute based quadrature loop vectorization

parent 593d03c5
No related branches found
No related tags found
No related merge requests found
...@@ -50,6 +50,8 @@ def valuearg(name, **kw): ...@@ -50,6 +50,8 @@ def valuearg(name, **kw):
@generator_factory(item_tags=("domain",), context_tags="kernel") @generator_factory(item_tags=("domain",), context_tags="kernel")
def domain(iname, shape): def domain(iname, shape):
if isinstance(iname, tuple):
iname = ",".join(iname)
if isinstance(shape, str): if isinstance(shape, str):
valuearg(shape) valuearg(shape)
return "{{ [{0}] : 0<={0}<{1} }}".format(iname, shape) return "{{ [{0}] : 0<={0}<{1} }}".format(iname, shape)
......
""" A kernel transformation that precomputes data and then splits computation """ A kernel transformation that precomputes data and then splits computation
in chunks of vector size independent of divisibility of the loop bounds. """ in chunks of vector size independent of divisibility of the loop bounds. """
from dune.perftool.loopy.vcl import get_vcl_type_size from dune.perftool.generation import (function_mangler,
from dune.perftool.loopy.transformations.vectorview import (add_vector_view, include_file,
loopy_class_member,
)
from dune.perftool.loopy.vcl import get_vcl_type, get_vcl_type_size
from dune.perftool.loopy.transformations.vectorview import (add_temporary_with_vector_view,
add_vector_view,
get_vector_view_name, get_vector_view_name,
) )
from dune.perftool.tools import get_pymbolic_basename from dune.perftool.loopy.symbolic import substitute
from dune.perftool.sumfact.quadrature import quadrature_inames
from dune.perftool.tools import get_pymbolic_basename, get_pymbolic_tag, ceildiv
from dune.perftool.options import get_option
from loopy.kernel.creation import parse_domains from loopy.kernel.creation import parse_domains
from loopy.symbolic import pw_aff_to_expr from loopy.symbolic import pw_aff_to_expr
from loopy.match import Tagged
from pymbolic.mapper.dependency import DependencyMapper from loopy.symbolic import DependencyMapper
from pymbolic.mapper.substitutor import substitute from pytools import product
import pymbolic.primitives as prim import pymbolic.primitives as prim
import loopy as lp import loopy as lp
import numpy as np import numpy as np
import re
def collect_vector_data_precompute(knl, insns, inames): class TransposeReg(lp.symbolic.FunctionIdentifier):
def __init__(self,
horizontal=1,
vertical=1,
):
self.horizontal = horizontal
self.vertical = vertical
def __getinitargs__(self):
return (self.horizontal, self.vertical)
@property
def name(self):
return "transpose_reg"
@function_mangler
def rotate_function_mangler(knl, func, arg_dtypes):
if isinstance(func, TransposeReg):
# This is not 100% within the loopy philosophy, as we are
# passing the vector registers as references and have them
# changed. Loopy assumes this function to be read-only.
include_file("dune/perftool/sumfact/transposereg.hh", filetag="operatorfile")
vcl = lp.types.NumpyType(get_vcl_type(np.float64, vector_width=func.horizontal * func.vertical))
return lp.CallMangleInfo(func.name, (), (vcl,) * func.horizontal)
class VectorIndices(object):
def __init__(self):
self.needed = set()
def get(self, increment):
name = "vec_index_inc{}".format(increment)
self.needed.add((name, increment))
return prim.Variable(name)
def collect_vector_data_precompute(knl):
# #
# Process/Assert/Standardize the input # Process/Assert/Standardize the input
# #
# inames input -> tuple insns = [i for i in lp.find_instructions(knl, lp.match.Tagged("quadvec"))]
if isinstance(inames, str): if not insns:
inames = inames.split(",") return knl
inames = tuple(i.strip() for i in inames) inames = quadrature_inames()
# Analyse the inames of the given instructions and identify inames
# that they all have in common. Those inames will also be iname dependencies
# of inserted instructions.
common_inames = frozenset([]).union(*(insn.within_inames for insn in insns)) - frozenset(inames)
# insns -> list of Instruction instances # Determine the vector lane width
if isinstance(insns, lp.match.MatchExpressionBase): # TODO infer the numpy type here
insns = lp.find_instructions(knl, insns) vec_size = get_vcl_type_size(np.float64)
else: vector_indices = VectorIndices()
if isinstance(insns, str):
insns = [i.strip() for i in insns.split(",")]
insns = [knl.id_to_insn[i] for i in insns]
# #
# Inspect the given instructions for dependent quantities # Inspect the given instructions for dependent quantities
# and precompute them # and precompute them
# #
quantities = [] quantities = []
for insn in insns: for insn in insns:
for expr in DependencyMapper()(insn.expression): for expr in DependencyMapper()(insn.expression):
quantities.append(get_pymbolic_basename(expr)) quantities.append(get_pymbolic_basename(expr))
quantities = set(quantities) quantities = set(quantities)
prec_quantities = []
for quantity in quantities: for quantity in quantities:
# Introduce a substitution rule and find save name # Check whether there is an instruction that writes this quantity within
subst_old = knl.substitutions.keys() # the given inames. If so, we need a buffer array.
knl = lp.assignment_to_subst(knl, quantity, extra_arguments=inames) iname_match = lp.match.And(tuple(lp.match.Iname(i) for i in inames))
subst_new = knl.substitutions.keys() write_match = lp.match.Writes(quantity)
subst_name, = set(subst_new) - set(subst_old) match = lp.match.And((iname_match, write_match))
write_insns = lp.find_instructions(knl, match)
# Do precomputation of the quantity if write_insns:
prec_quantity = "{}_precomputed".format(quantity) # Introduce a substitution rule and find save name
knl = lp.precompute(knl, subst_name, inames, subst_old = knl.substitutions.keys()
temporary_name=prec_quantity, knl = lp.assignment_to_subst(knl, quantity)
) subst_new = knl.substitutions.keys()
subst_name, = set(subst_new) - set(subst_old)
# Introduce a vector view of the precomputation result # Do precomputation of the quantity
knl = add_vector_view(knl, prec_quantity) prec_quantity = "{}_precomputed".format(quantity)
prec_quantities.append(prec_quantity)
knl = lp.precompute(knl, subst_name, inames,
temporary_name=prec_quantity,
)
# Enforce memory layout of the precomputation
tmps = knl.temporary_variables
tmps[prec_quantity] = tmps[prec_quantity].copy(dim_tags=",".join(["f"] * len(inames)))
knl = knl.copy(temporary_variables=tmps)
# Introduce a vector view of the precomputation result
knl = add_vector_view(knl, prec_quantity, flatview=True)
# #
# Construct a flat loop for the given instructions # Construct a flat loop for the given instructions
# #
new_insns = [] new_insns = []
other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns]]
size = prim.Product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames)) size = product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames))
vec_size = get_vcl_type_size(np.float64) vec_size = get_vcl_type_size(np.float64)
size = prim.FloorDiv(size, vec_size) size = ceildiv(size, vec_size)
temporaries = knl.temporary_variables
temporaries["flatsize"] = lp.TemporaryVariable("flatsize",
dtype=np.int32,
shape=(),
)
new_insns.append(lp.Assignment(prim.Variable("flatsize"),
size,
)
)
# Add an additional domain to the kernel # Add an additional domain to the kernel
new_iname = "flat_{}".format("_".join(inames)) outer_iname = "flat_{}".format("_".join(inames))
domain = "{{ [{0}] : 0<={0}<flatsize }}".format(new_iname, str(size)) o_domain = "{{ [{0}] : 0<={0}<{1} }}".format(outer_iname, size)
domain = parse_domains(domain, {}) o_domain = parse_domains(o_domain, {})
knl = knl.copy(domains=knl.domains + domain, vec_iname = "vec_{}".format("_".join(inames))
temporary_variables=temporaries) i_domain = "{{ [{0}] : 0<={0}<{1} }}".format(vec_iname, vec_size)
i_domain = parse_domains(i_domain, {})
knl = knl.copy(domains=knl.domains + o_domain + i_domain)
knl = lp.tag_inames(knl, [(vec_iname, "vec")])
# Update instruction lists
insns = [i for i in lp.find_instructions(knl, lp.match.Tagged("quadvec"))]
other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns]]
quantities = {}
for insn in insns:
for expr in DependencyMapper()(insn.expression):
basename = get_pymbolic_basename(expr)
quantities.setdefault(basename, frozenset())
quantities[basename] = quantities[basename].union(frozenset([expr]))
replacemap = {}
# Now gather a replacement map for all the quantities
for quantity, quantity_exprs in quantities.items():
# This might be a quantity precomputed earlier
if quantity in prec_quantities:
for expr in quantity_exprs:
replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), (vector_indices.get(1), prim.Variable(vec_iname)))
# it might also be the output of a sumfactorization kernel
elif quantity in knl.temporary_variables:
tag, = set(get_pymbolic_tag(expr) for expr in quantity_exprs)
if tag is not None and tag.startswith('vecsumfac'):
# Extract information from the tag
horizontal, vertical = tuple(int(i) for i in re.match("vecsumfac_h(.*)_v(.*)", tag).groups())
# Split and tag the flat iname # 1. Rotating the input data
knl = lp.split_iname(knl, new_iname, vec_size, inner_tag="vec") knl = add_vector_view(knl, quantity, flatview=True)
new_inames = ("{}_outer".format(new_iname), "{}_inner".format(new_iname)) if horizontal > 1:
knl = lp.assume(knl, "flatsize mod {} = 0".format(vec_size)) new_insns.append(lp.CallInstruction((), # assignees
prim.Call(TransposeReg(vertical=vertical, horizontal=horizontal),
tuple(prim.Subscript(prim.Variable(get_vector_view_name(quantity)),
(vector_indices.get(horizontal) + i, prim.Variable(vec_iname)))
for i in range(horizontal))),
depends_on=frozenset({'continue_stmt'}),
within_inames=common_inames.union(frozenset({outer_iname, vec_iname})),
within_inames_is_final=True,
id="{}_rotate".format(quantity),
))
# Add substitution rules
for expr in quantity_exprs:
assert isinstance(expr, prim.Subscript)
last_index = expr.index[-1] // vertical
replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)),
(vector_indices.get(horizontal) + last_index, prim.Variable(vec_iname)),
)
elif tag is not None and tag == 'sumfac':
# Add a vector view to this quantity
expr, = quantity_exprs
knl = add_vector_view(knl, quantity, flatview=True)
replacemap[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)),
(vector_indices.get(1), prim.Variable(vec_iname)),
)
elif quantity in [a.name for a in knl.args]:
arg, = [a for a in knl.args if a.name == quantity]
tags = set(get_pymbolic_tag(expr) for expr in quantity_exprs)
if tags and tags.pop() == "operator_precomputed":
expr, = quantity_exprs
shape=(ceildiv(product(s for s in arg.shape), vec_size), vec_size)
name = loopy_class_member(quantity,
shape=shape,
dim_tags="f,vec",
potentially_vectorized=True,
classtag="operator",
dtype=np.float64,
)
knl = knl.copy(args=knl.args + [lp.GlobalArg(name, shape=shape, dim_tags="c,vec", dtype=np.float64)])
replacemap[expr] = prim.Subscript(prim.Variable(name),
(vector_indices.get(1), prim.Variable(vec_iname)),
)
for insn in insns: for insn in insns:
# Get a vector view of the lhs expression # Get a vector view of the lhs expression
lhsname = get_pymbolic_basename(insn.assignee) lhsname = get_pymbolic_basename(insn.assignee)
knl = add_vector_view(knl, lhsname) knl = add_vector_view(knl, lhsname, pad_to=vec_size, flatview=True)
lhsname = get_vector_view_name(lhsname) lhsname = get_vector_view_name(lhsname)
rotating = "gradvec" in insn.tags
if rotating:
assert isinstance(insn.assignee, prim.Subscript)
tag = get_pymbolic_tag(insn.assignee)
horizontal, vertical = tuple(int(i) for i in re.match("vecsumfac_h(.*)_v(.*)", tag).groups())
if horizontal > 1:
last_index = insn.assignee.index[-1] // vertical
else:
last_index = 0
else:
last_index = 0
horizontal = 1
new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname), tuple(prim.Variable(i) for i in new_inames)), new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname),
prim.Subscript(prim.Variable(get_vector_view_name("wk_precomputed")), tuple(prim.Variable(i) for i in new_inames)), (vector_indices.get(horizontal) + last_index, prim.Variable(vec_iname)),
within_inames=frozenset(new_inames), ),
substitute(insn.expression, replacemap),
depends_on=frozenset({"continue_stmt"}),
depends_on_is_final=True,
within_inames=common_inames.union(frozenset({outer_iname, vec_iname})),
within_inames_is_final=True, within_inames_is_final=True,
id=insn.id,
tags=frozenset({"vec_write"})
) )
) )
return knl.copy(instructions=new_insns + other_insns) # Rotate back!
if rotating and "{}_rotateback".format(lhsname) not in [i.id for i in new_insns] and horizontal > 1:
new_insns.append(lp.CallInstruction((), # assignees
prim.Call(TransposeReg(horizontal=horizontal, vertical=vertical),
tuple(prim.Subscript(prim.Variable(lhsname),
(vector_indices.get(horizontal) + i, prim.Variable(vec_iname)))
for i in range(horizontal))),
depends_on=frozenset({Tagged("vec_write")}),
within_inames=common_inames.union(frozenset({outer_iname, vec_iname})),
within_inames_is_final=True,
id="{}_rotateback".format(lhsname),
))
# Add the necessary vector indices
temporaries = knl.temporary_variables
for name, increment in vector_indices.needed:
temporaries[name] = lp.TemporaryVariable(name, # name
dtype=np.int32,
scope=lp.temp_var_scope.PRIVATE,
)
new_insns.append(lp.Assignment(prim.Variable(name), # assignee
0, # expression
within_inames=common_inames,
within_inames_is_final=True,
id="assign_{}".format(name),
))
new_insns.append(lp.Assignment(prim.Variable(name), # assignee
prim.Sum((prim.Variable(name), increment)), # expression
within_inames=common_inames.union(frozenset({outer_iname})),
within_inames_is_final=True,
depends_on=frozenset({Tagged("vec_write"), "assign_{}".format(name)}),
depends_on_is_final=True,
id="update_{}".format(name),
))
from loopy.kernel.creation import resolve_dependencies
return resolve_dependencies(knl.copy(instructions=new_insns + other_insns,
temporary_variables=temporaries,
))
...@@ -503,7 +503,9 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): ...@@ -503,7 +503,9 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
if get_option("vectorize_quad"): if get_option("vectorize_quad"):
if get_option("sumfact"): if get_option("sumfact"):
from dune.perftool.loopy.transformations.collect_rotate import collect_vector_data_rotate from dune.perftool.loopy.transformations.collect_rotate import collect_vector_data_rotate
kernel = collect_vector_data_rotate(kernel) from dune.perftool.loopy.transformations.collect_precompute import collect_vector_data_precompute
kernel = collect_vector_data_precompute(kernel)
# kernel = collect_vector_data_rotate(kernel)
else: else:
raise NotImplementedError("Only vectorizing sumfactorized code right now!") raise NotImplementedError("Only vectorizing sumfactorized code right now!")
......
...@@ -4,6 +4,7 @@ from dune.perftool.generation import (backend, ...@@ -4,6 +4,7 @@ from dune.perftool.generation import (backend,
get_global_context_value, get_global_context_value,
iname, iname,
instruction, instruction,
kernel_cached,
loopy_class_member, loopy_class_member,
temporary_variable, temporary_variable,
) )
...@@ -74,16 +75,12 @@ def pymbolic_base_weight(): ...@@ -74,16 +75,12 @@ def pymbolic_base_weight():
return 1.0 return 1.0
@iname
def sumfact_quad_iname(d, bound):
name = "quad_{}".format(d)
domain(name, quadrature_points_per_direction())
return name
@backend(interface="quad_inames", name="sumfact") @backend(interface="quad_inames", name="sumfact")
@kernel_cached
def quadrature_inames(): def quadrature_inames():
return tuple(sumfact_quad_iname(d, quadrature_points_per_direction()) for d in range(local_dimension())) names = tuple("quad_{}".format(d) for d in range(local_dimension()))
domain(names, quadrature_points_per_direction())
return names
@iname(kernel="operator") @iname(kernel="operator")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment