Newer
Older
""" Sum factorization vectorization """
from dune.perftool.loopy.target import dtype_floatingpoint
from dune.perftool.loopy.vcl import get_vcl_type_size
from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
from dune.perftool.generation import (backend,
generator_factory,
get_backend,
get_counted_variable,
from dune.perftool.pdelab.restriction import (Restriction,
restricted_name,
)
from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
quadrature_points_per_direction,
set_quadrature_points,
from dune.perftool.error import PerftoolVectorizationError
from dune.perftool.options import get_form_option
from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff
from pytools import product
from frozendict import frozendict
import itertools as it
import loopy as lp
import numpy as np
@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
def _cache_vectorization_info(old, new):
if new is None:
raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!")
_collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True)
def attach_vectorization_info(sf):
assert isinstance(sf, SumfactKernel)
if get_global_context_value("dry_run"):
def position_penalty_factor(sf):
Dominic Kempf
committed
if isinstance(sf, SumfactKernel) or sf.vertical_width > 1:
return 1
else:
return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length))
@backend(interface="vectorization_strategy", name="model")
def costmodel(sf):
# Penalize vertical vectorization
vertical_penalty = 1 + math.log(sf.vertical_width)
Dominic Kempf
committed
# Penalize scalar sum factorization kernels
scalar_penalty = 1
if isinstance(sf, SumfactKernel):
scalar_penalty = get_vcl_type_size(dtype_floatingpoint())
Dominic Kempf
committed
Dominic Kempf
committed
return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
@backend(interface="vectorization_strategy", name="fromlist")
def fromlist_costmodel(sf):
# The fromlist strategy needs to reuse the cost model!
return costmodel(sf)
def explicit_costfunction(sf):
# Read the explicitly set values for horizontal and vertical vectorization
width = get_vcl_type_size(dtype_floatingpoint())
horizontal = get_form_option("vectorization_horizontal")
vertical = get_form_option("vectorization_vertical")
if vertical is None:
vertical = 1
horizontal = int(horizontal)
vertical = int(vertical)
if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
# Penalize position mapping
return sf.operations * position_penalty_factor(sf)
return 1000000000000
def strategy_cost(strat_tuple):
qp, strategy = strat_tuple
selector=lambda: get_form_option("vectorization_strategy"))
keys = set(sf.cache_key for sf in strategy.values())
set_quadrature_points(qp)
# Sum over all the sum factorization kernels in the realization
score = 0.0
for sf in strategy.values():
if sf.cache_key in keys:
score = score + float(func(sf))
keys.discard(sf.cache_key)
return score
def fixedqp_strategy_costfunction(qp):
def _cost(strategy):
return strategy_cost((qp, strategy))
return _cost
def stringify_vectorization_strategy(strategy):
result = []
qp, strategy = strategy
result.append("Printing potential vectorization strategy:")
result.append("Quadrature point tuple: {}".format(qp))
# Look for all realizations in the strategy and iterate over them
cache_keys = frozenset(v.cache_key for v in strategy.values())
for ck in cache_keys:
# Filter all the kernels that are realized by this and print
for key in strategy:
if strategy[key].cache_key == ck:
# Find one representative to print
for val in strategy.values():
if val.cache_key == ck:
def decide_vectorization_strategy():
""" Decide how to vectorize!
Note that the vectorization of the quadrature loop is independent of this,
as it is implemented through a post-processing (== loopy transformation) step.
"""
logger = logging.getLogger(__name__)
# Retrieve all sum factorization kernels for stage 1 and 3
from dune.perftool.generation import retrieve_cache_items
all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
# Stage 1 sum factorizations that were actually used
basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')]
# This means we can have sum factorizations that will not get used
inactive_sumfacts = [i for i in all_sumfacts if i.stage == 1 and i not in basis_sumfacts]
# All sum factorization kernels that get used
active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
# If no vectorization is needed, abort now
if get_form_option("vectorization_strategy") == "none":
for sf in all_sumfacts:
_cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
return
logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
.format(len(active_sumfacts)))
# Find the best vectorization strategy by using a costmodel
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# Note that this optimization procedure uses a hierarchic approach to bypass
# the problems of unfavorable complexity of the set of all possible vectorization
# opportunities. Optimizations are performed at different levels (you find these
# levels in the function names implementing them), where optimal solutions at a
# higher level are combined into lower level solutions or optima of optimal solutions
# at higher level are calculated:
# * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima)
# * Level 2: Split by parallelizability and combine optima into optimal solution
# * Level 3: Optimize number of different inputs to consider
# * Level 4: Optimize horizontal/vertical/hybrid strategy
width = get_vcl_type_size(dtype_floatingpoint())
qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width)
# TODO: Check how the 'fromlist' generator fits into the new overall picture
#
# if get_form_option("vectorization_strategy") == "fromlist":
# # This is a bit special and does not follow the minimization procedure at all
#
# def _choose_strategy_from_list(stage1_sumfacts):
# strategy = 0
# for qp in quad_points:
# for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp):
# if strategy == int(get_form_option("vectorization_list_index")):
# # Output the strategy and its cost into a separate file
# if get_global_context_value("form_type") == "jacobian_apply":
# with open("strategycosts.csv", "a") as f:
# f.write("{} {}\n".format(strategy, strategy_cost((qp, strat))))
# return qp, strat
# strategy = strategy + 1
#
# raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_form_option("vectorization_list_index")))
#
# s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1)
#
# total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points)
# print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_form_option("vectorization_list_index")),
# total))
# qp, sfdict = _choose_strategy_from_list(s1_sumfacts)
#
# keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1)
# for key in keys:
# key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key)
# minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
# key=fixedqp_strategy_costfunction(qp))
# sfdict = add_to_frozendict(sfdict, minimum)
Dominic Kempf
committed
set_quadrature_points(qp)
logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
"\n".join(stringify_vectorization_strategy((qp, sfdict))))
# We map inactive sum factorization kernels to 0
sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
# Register the results
for sf in all_sumfacts:
_cache_vectorization_info(sf, sfdict[sf])
def level1_optimal_vectorization_strategy(sumfacts, width):
# Gather a list of possible quadrature point tuples
quad_points = [quadrature_points_per_direction()]
if get_form_option("vectorization_allow_quadrature_changes"):
sf = next(iter(sumfacts))
depth = 1
while depth <= width:
i = 0 if sf.matrix_sequence[0].face is None else 1
quad = list(quadrature_points_per_direction())
quad[i] = round_to_multiple(quad[i], depth)
quad_points.append(tuple(quad))
depth = depth * 2
quad_points = list(set(quad_points))
Dominic Kempf
committed
# Find the minimum cost strategy between all the quadrature point tuples
optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points}
qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
return qp, optimal_strategies[qp]
def level2_optimal_vectorization_strategy(sumfacts, width, qp):
# Find the sets of simultaneously realizable kernels (thats an equivalence relation)
keys = frozenset(sf.input_key for sf in sumfacts)
Dominic Kempf
committed
# Find minimums for each of these sets
sfdict = frozendict()
Dominic Kempf
committed
for key in keys:
key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
key=fixedqp_strategy_costfunction(qp))
def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
if len(sumfacts) == 0:
# We have gone into recursion deep enough to have all sum factorization nodes
# assigned their vectorized counterpart. We can yield the result now!
yield already
return
# Ensure a deterministic order of the given sumfact kernels. This is necessary for the
# fromlist strategy to pick correct strategies across different program runs
sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
# Otherwise we pick a random sum factorization kernel and construct all the vectorization
# opportunities realizing this particular kernel and go into recursion.
sf_to_decide = sumfacts[0]
# Have "unvectorized" as an option, although it is not good
for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
width,
qp,
add_to_frozendict(already,
{sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
),
):
yield opp
horizontal = 1
while horizontal <= width:
# Iterate over the possible combinations of sum factorization kernels
# taking into account all the permutations of kernels. This also includes
# combinations which use a padding of 1 - but only for pure horizontality.
Dominic Kempf
committed
generators = [it.permutations(sumfacts, horizontal)]
Dominic Kempf
committed
generators.append(it.permutations(sumfacts, horizontal - 1))
# The chosen kernels must be part of the kernels for recursion
# to work correctly
if sf_to_decide not in combo:
continue
# Set up the vectorization dict for this combo
vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
if vecdict is None:
# This particular choice was rejected for some reason.
# Possible reasons:
# * the quadrature point tuple not being suitable
# for this vectorization strategy
continue
# Go into recursion to also vectorize all kernels not in this combo
for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
width,
qp,
add_to_frozendict(already, vecdict),
):
yield opp
horizontal = horizontal * 2
def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
# Enhance the list of sumfact nodes by adding vertical splittings
kernels = []
for sf in sumfacts:
# No slicing needed in the pure horizontal case
if vertical == 1:
kernels.append(sf)
continue
# Determine the slicing direction
slice_direction = 0 if sf.matrix_sequence[0].face is None else 1
if qp[slice_direction] % vertical != 0:
return None
# Split the basis tabulation matrices
oldtab = sf.matrix_sequence[slice_direction]
for i in range(vertical):
seq = list(sf.matrix_sequence)
seq[slice_direction] = oldtab.copy(slice_size=vertical,
slice_index=i)
kernels.append(sf.copy(matrix_sequence=tuple(seq)))
# Join the new kernels into a sum factorization node
buffer = get_counted_variable("joined_buffer")
return {sf: VectorizedSumfactKernel(kernels=tuple(kernels),
horizontal_width=horizontal,
vertical_width=vertical,
buffer=buffer,
) for sf in sumfacts}