Skip to content
Snippets Groups Projects
Commit 193fb5f7 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

A first draft of lower/upper half vectorization for stage 1

parent 880adbdc
No related branches found
No related tags found
No related merge requests found
......@@ -76,8 +76,17 @@ class ExplicitVCLCast(lp.symbolic.FunctionIdentifier):
return get_vcl_typename(self.nptype, vector_width=self.vector_width)
class VCLLowerUpperLoad(ExplicitVCLCast):
    """Function identifier for lower/upper half SIMD loads.

    Adds no behavior of its own on top of ExplicitVCLCast; the distinct
    type exists so that code dispatching on the identifier (e.g. via
    isinstance) can tell such loads apart from plain explicit casts.
    """
@function_mangler
def vcl_cast_mangler(knl, func, arg_dtypes):
if isinstance(func, VCLLowerUpperLoad):
return lp.CallMangleInfo(func.name,
(lp.types.NumpyType(func.nptype),),
arg_dtypes)
if isinstance(func, ExplicitVCLCast):
return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],))
......
......@@ -6,7 +6,7 @@ from dune.perftool.pdelab.geometry import local_dimension, world_dimension
from dune.perftool.sumfact.quadrature import quadrature_inames
from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray
from dune.perftool.loopy.target import dtype_floatingpoint
from dune.perftool.loopy.vcl import ExplicitVCLCast
from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad
from pytools import ImmutableRecord, product
......@@ -61,7 +61,11 @@ class VectorSumfactKernelInput(SumfactKernelInputBase):
# The lower and the upper part of the SIMD register use
# the same input coefficient, so we combine the SIMD register
# from two shorter SIMD types.
raise NotImplementedError("Lower/Upper half SIMD loads not implemented!")
return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()),
(self.inputs[0].realize_direct(shape, inames),
self.inputs[len(self.inputs) // 2].realize_direct(shape, inames),
)
)
else:
# The input does not exhibit a broadcastable structure, we
# need to load scalars into the SIMD vector.
......@@ -405,7 +409,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
# Assert all the properties that need to be the same across all subkernels
assert len(set(k.stage for k in kernels)) == 1
assert len(set(k.length for k in kernels)) == 1
assert len(set(k.restriction for k in kernels)) == 1
assert len(set(k.within_inames for k in kernels)) == 1
assert len(set(k.predicates for k in kernels)) == 1
......@@ -413,7 +416,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
for i in range(kernels[0].length):
assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1
assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1
assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1
assert len(set(tuple(k.matrix_sequence[i].direction for k in kernels))) == 1
assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1
# Join the instruction dependencies of all subkernels
......
......@@ -131,7 +131,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
assert len(set(t.quadrature_size for t in tabs)) == 1
assert len(set(t.basis_size for t in tabs)) == 1
assert len(set(t.transpose for t in tabs)) == 1
assert len(set(t.face for t in tabs)) == 1
assert len(set(t.direction for t in tabs)) == 1
assert len(set(t.slice_size for t in tabs)) == 1
self.tabs = tabs
......
......@@ -265,85 +265,131 @@ def level2_optimal_vectorization_strategy(sumfacts, width, qp):
for key in keys:
key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key)
key_strategy = level3_optimal_vectorization_strategy(key_sumfacts, width, qp)
key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp),
key=fixedqp_strategy_costfunction(qp))
sfdict = add_to_frozendict(sfdict, key_strategy)
return sfdict
def level3_optimal_vectorization_strategy(sumfacts, width, qp):
# Find the sets of simultaneously realizable kernels (that's an equivalence relation)
keys = frozenset(sf.input_key for sf in sumfacts)
def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
    """Yield complete vectorization strategies for the given kernels.

    Every (possibly partial) strategy produced by
    ``_level2_optimal_vectorization_strategy_generator`` is extended so that
    each kernel in ``sumfacts`` has an entry: kernels missing from the
    partial strategy are mapped to a copy of themselves carrying a freshly
    counted "buffer" variable, i.e. their non-vectorized implementation.
    """
    for partial in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
        # Kernels not covered by the partial strategy fall back to a
        # non-vectorized implementation.
        fallback = {}
        for sf in sumfacts:
            if sf in partial:
                continue
            fallback[sf] = sf.copy(buffer=get_counted_variable("buffer"))
        yield add_to_frozendict(partial, fallback)
# Find minimums for each of these sets
sfdict = frozendict()
for key in keys:
key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
key=fixedqp_strategy_costfunction(qp))
sfdict = add_to_frozendict(sfdict, minimum)
def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()):
if len(sumfacts) == 0:
yield already
return
return sfdict
# Track whether this generator has yielded a vectorization opportunity. If it has not,
# we yield the incomplete strategy instead (which is then completed with unvectorized
# kernel implementations).
yielded = False
# Find the number of input coefficients we can work on
keys = frozenset(sf.input_key for sf in sumfacts)
inputkey_sumfacts = [frozenset(filter(lambda sf: sf.input_key == key, sumfacts)) for key in keys]
def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
if len(sumfacts) == 0:
# We have gone into recursion deep enough to have all sum factorization nodes
# assigned their vectorized counterpart. We can yield the result now!
for parallel in (1, 2):
if parallel == 2 and next(iter(sumfacts)).stage == 3:
continue
for which in filter(lambda w: w == tuple(sorted(w)),
it.permutations(range(len(keys)), parallel)):
horizontal = 1
while horizontal <= width // parallel:
generators = [filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal) for part in which)))]
if horizontal >=4:
generators.append(filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal - 1) for part in which))))
for combo in it.chain(*generators):
combo = sum(combo, ())
vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
if vecdict is None:
# This particular choice was rejected for some reason.
# Possible reasons:
# * the quadrature point tuple not being suitable
# for this vectorization strategy
continue
# Go into recursion to also vectorize all kernels not in this combo
for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
width,
qp,
add_to_frozendict(already, vecdict),
):
yielded = True
yield opp
horizontal *= 2
# If we did not yield on this recursion level, yield what we got so far
if not yielded:
yield already
return
# Ensure a deterministic order of the given sumfact kernels. This is necessary for the
# fromlist strategy to pick correct strategies across different program runs
sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
# Otherwise we pick the first sum factorization kernel (in the sorted order) and construct
# all the vectorization opportunities realizing this particular kernel and go into recursion.
sf_to_decide = sumfacts[0]
# Have "unvectorized" as an option, although it is not good
for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
width,
qp,
add_to_frozendict(already,
{sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
),
):
yield opp
horizontal = 1
while horizontal <= width:
# Iterate over the possible combinations of sum factorization kernels
# taking into account all the permutations of kernels. This also includes
# combinations which use a padding of 1 - but only for pure horizontality.
generators = [it.permutations(sumfacts, horizontal)]
if horizontal >= 4:
generators.append(it.permutations(sumfacts, horizontal - 1))
for combo in it.chain(*generators):
# The chosen kernels must be part of the kernels for recursion
# to work correctly
if sf_to_decide not in combo:
continue
# Set up the vectorization dict for this combo
vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
if vecdict is None:
# This particular choice was rejected for some reason.
# Possible reasons:
# * the quadrature point tuple not being suitable
# for this vectorization strategy
continue
# Go into recursion to also vectorize all kernels not in this combo
for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
width,
qp,
add_to_frozendict(already, vecdict),
):
yield opp
horizontal = horizontal * 2
#
# def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
# if len(sumfacts) == 0:
# # We have gone into recursion deep enough to have all sum factorization nodes
# # assigned their vectorized counterpart. We can yield the result now!
# yield already
# return
#
# # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
# # fromlist strategy to pick correct strategies across different program runs
# sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
#
# # Otherwise we pick a random sum factorization kernel and construct all the vectorization
# # opportunities realizing this particular kernel and go into recursion.
# sf_to_decide = sumfacts[0]
#
# # Have "unvectorized" as an option, although it is not good
# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
# width,
# qp,
# add_to_frozendict(already,
# {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
# ),
# ):
# yield opp
#
# horizontal = 1
# while horizontal <= width:
# # Iterate over the possible combinations of sum factorization kernels
# # taking into account all the permutations of kernels. This also includes
# # combinations which use a padding of 1 - but only for pure horizontality.
# generators = [it.permutations(sumfacts, horizontal)]
# if horizontal >= 4:
# generators.append(it.permutations(sumfacts, horizontal - 1))
# for combo in it.chain(*generators):
# # The chosen kernels must be part of the kernels for recursion
# # to work correctly
# if sf_to_decide not in combo:
# continue
#
# # Set up the vectorization dict for this combo
# vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
# if vecdict is None:
# # This particular choice was rejected for some reason.
# # Possible reasons:
# # * the quadrature point tuple not being suitable
# # for this vectorization strategy
# continue
#
# # Go into recursion to also vectorize all kernels not in this combo
# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
# width,
# qp,
# add_to_frozendict(already, vecdict),
# ):
# yield opp
#
# horizontal = horizontal * 2
def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment