""" Sum factorization vectorization """
from dune.perftool.loopy.target import dtype_floatingpoint
from dune.perftool.loopy.vcl import get_vcl_type_size
from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
from dune.perftool.generation import (backend,
                                      generator_factory,
                                      get_backend,
                                      get_counted_variable,
                                      get_global_context_value,
                                      )
from dune.perftool.pdelab.restriction import (Restriction,
                                              restricted_name,
                                              )
from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                              quadrature_points_per_direction,
                                              set_quadrature_points,
                                              )
from dune.perftool.error import PerftoolVectorizationError
from dune.perftool.options import get_form_option, get_option, set_form_option
from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff

from pytools import product
from frozendict import frozendict

import itertools as it
import logging
import math

import loopy as lp
import numpy as np
@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
def _cache_vectorization_info(old, new):
    """Cache the vectorization decision for a scalar sum factorization kernel.

    The generator_factory decorator memoizes on the first argument only
    (the original scalar kernel), so a later call with the same kernel
    retrieves what was stored for it earlier.

    NOTE(review): as visible here the function raises unconditionally after
    the option check; a 'return new' success path appears to have been lost
    in transcription -- confirm against upstream.
    """
    if get_form_option("vectorization_not_fully_vectorized_error"):
        # The user requested a hard error when a kernel stayed unvectorized
        if not isinstance(new, VectorizedSumfactKernel):
            raise PerftoolVectorizationError("Did not fully vectorize!")
    raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!")
_collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True)
def attach_vectorization_info(sf):
    """Return the realization to use for the given sum factorization kernel.

    During a dry run the scalar kernel is merely recorded so that the
    vectorization decision pass can inspect it later; otherwise the
    previously cached vectorization info is looked up.

    NOTE(review): the body of this function was truncated in the source
    (the 'if' had no suite); restored from the obvious intent -- confirm
    against upstream dune-perftool.
    """
    assert isinstance(sf, SumfactKernel)
    if get_global_context_value("dry_run"):
        # Record the kernel for the later decision pass
        return _collect_sumfact_nodes(sf)
    else:
        # Look up the cached decision (raises if none was gathered)
        return _cache_vectorization_info(sf, None)
def costmodel(sf):
    """Heuristic cost of a (possibly vectorized) sum factorization kernel.

    The cost is the kernel's operation count weighted by penalties for
    vertical vectorization and for remaining scalar.
    """
    # Penalize vertical vectorization
    vertical_penalty = 1 + math.log(sf.vertical_width)

    # Penalize scalar sum factorization kernels
    scalar_penalty = 1
    if isinstance(sf, SumfactKernel):
        scalar_penalty = get_vcl_type_size(dtype_floatingpoint())

    return sf.operations * vertical_penalty * scalar_penalty
def explicit_costfunction(sf):
    """Cost function enforcing the user-specified vectorization widths.

    Kernels matching the explicitly requested horizontal/vertical widths
    are scored by the regular cost model; everything else receives a
    prohibitively large cost so the minimizer avoids it.

    NOTE(review): the default for a missing horizontal option and the
    matching-branch return were lost in transcription and are
    reconstructed here -- confirm against upstream dune-perftool.
    """
    # Read the explicitly set values for horizontal and vertical vectorization
    width = get_vcl_type_size(dtype_floatingpoint())
    horizontal = get_form_option("vectorization_horizontal")
    if horizontal is None:
        horizontal = width
    vertical = get_form_option("vectorization_vertical")
    if vertical is None:
        vertical = 1

    horizontal = int(horizontal)
    vertical = int(vertical)

    if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
        # Penalize position mapping
        return costmodel(sf)
    else:
        # Reject any kernel that deviates from the requested widths
        return 1000000000000
def target_costfunction(sf):
    """Score a kernel by its distance to the user-given target cost.

    The cost of a kernel is given by the difference to the desired target cost.
    Pitfall: The target cost needs to be weighed to account for this being called
    on subsets and not on a full vectorization strategy!

    NOTE(review): the tail of this function (the weighting denominator and
    the return statement) was lost in transcription; the reconstruction
    below must be confirmed against upstream dune-perftool.
    """
    target = float(get_form_option("vectorization_target"))
    realcost = costmodel(sf)

    # Weigh the target by the fraction of SIMD lanes this kernel occupies
    total = get_vcl_type_size(dtype_floatingpoint())  # TODO confirm denominator
    ratio = sf.horizontal_width / total
    return abs(realcost - ratio * target)
def strategy_cost(strat_tuple):
    """Total cost of a complete vectorization strategy.

    strat_tuple is (quadrature point tuple, dict mapping scalar kernels to
    their chosen realization).  The per-kernel cost function is selected
    via the 'vectorization_strategy' form option.

    NOTE(review): qp is unpacked but unused here -- upstream presumably
    calls set_quadrature_points(qp) before costing; confirm.
    """
    qp, strategy = strat_tuple

    # Choose the correct cost function
    s = get_form_option("vectorization_strategy")
    if s == "model":
        func = costmodel
    elif s == "explicit":
        func = explicit_costfunction
    elif s == "target":
        func = target_costfunction
    else:
        raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s))

    # Several scalar kernels may map to the same realization; count each
    # realization (identified by its cache_key) exactly once.
    keys = set(sf.cache_key for sf in strategy.values())

    # Sum over all the sum factorization kernels in the realization
    score = 0.0
    for sf in strategy.values():
        if sf.cache_key in keys:
            score = score + float(func(sf))
            keys.discard(sf.cache_key)

    return score
def fixedqp_strategy_costfunction(qp):
    """Bind a quadrature tuple, yielding a cost function of the strategy alone."""
    return lambda strategy: strategy_cost((qp, strategy))
def stringify_vectorization_strategy(strategy):
    """Return a list of human readable lines describing a vectorization strategy.

    NOTE(review): the inner loop bodies of this function were lost in
    transcription; the result.append calls below are reconstructed and
    must be confirmed against upstream dune-perftool.
    """
    result = []
    qp, strategy = strategy

    result.append("Printing potential vectorization strategy:")
    result.append("Quadrature point tuple: {}".format(qp))

    # Look for all realizations in the strategy and iterate over them
    cache_keys = frozenset(v.cache_key for v in strategy.values())
    for ck in cache_keys:
        # Filter all the kernels that are realized by this and print
        for key in strategy:
            if strategy[key].cache_key == ck:
                result.append("{}".format(key))

        # Find one representative to print
        for val in strategy.values():
            if val.cache_key == ck:
                result.append("    {}".format(val))
                break

    return result
def short_stringify_vectorization_strategy(strategy):
    """A short string describing the vectorization strategy.

    Used in costmodel validation plots to describe what a data point does.
    """
    qp, strategy = strategy

    def _short(kernel):
        # Vectorized kernels are summarized by their horizontal width
        if isinstance(kernel, VectorizedSumfactKernel):
            return str(kernel.horizontal_width)
        return "scalar"

    # Collect per-stage summaries of all chosen realizations
    stage1 = [_short(k) for k in strategy.values() if k.stage == 1]
    stage3 = [_short(k) for k in strategy.values() if k.stage == 3]

    return "m0={};S1:{};S3:{}".format(qp[0], "|".join(stage1), "|".join(stage3))
def filter_active_inactive_sumfacts():
    """Partition all recorded sum factorization kernels by whether they are used.

    Returns the tuple (all_sumfacts, active_sumfacts, inactive_sumfacts),
    matching the unpacking performed in decide_vectorization_strategy.
    The return statement was missing in the transcribed source.
    """
    # Retrieve all sum factorization kernels for stage 1 and 3
    from dune.perftool.generation import retrieve_cache_items
    all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]

    # Stage 1 sum factorizations that were actually used
    basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')]

    # This means we can have sum factorizations that will not get used
    inactive_sumfacts = [i for i in all_sumfacts if i.stage == 1 and i not in basis_sumfacts]

    # All sum factorization kernels that get used
    active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]

    return all_sumfacts, active_sumfacts, inactive_sumfacts
def decide_vectorization_strategy():
    """ Decide how to vectorize!
    Note that the vectorization of the quadrature loop is independent of this,
    as it is implemented through a post-processing (== loopy transformation) step.
    """
    logger = logging.getLogger(__name__)

    all_sumfacts, active_sumfacts, inactive_sumfacts = filter_active_inactive_sumfacts()

    # If no vectorization is needed, abort now
    if get_form_option("vectorization_strategy") == "none" or (get_global_context_value("form_type") == "jacobian" and not get_form_option("vectorization_jacobians")):
        for sf in all_sumfacts:
            _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
        return

    logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
                 .format(len(active_sumfacts)))

    # Find the best vectorization strategy by using a costmodel
    # Note that this optimization procedure uses a hierarchic approach to bypass
    # the problems of unfavorable complexity of the set of all possible vectorization
    # opportunities. Optimizations are performed at different levels (you find these
    # levels in the function names implementing them), where optimal solutions at a
    # higher level are combined into lower level solutions or optima of optimal solutions
    # at higher level are calculated:
    # * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima)
    # * Level 2: Split by parallelizability and combine optima into optimal solution
    # * Level 3: Optimize number of different inputs to consider
    # * Level 4: Optimize horizontal/vertical/hybrid strategy
    width = get_vcl_type_size(dtype_floatingpoint())
    qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width)

    set_quadrature_points(qp)

    logger.debug("decide_vectorization_strategy: Decided for the following strategy:" +
                 "\n " +
                 "\n ".join(stringify_vectorization_strategy((qp, sfdict))))

    # We map inactive sum factorization kernels to 0
    sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})

    # Register the results
    # NOTE(review): the loop head below was swallowed by transcription junk
    # and is restored from context (sfdict covers all kernels) -- confirm.
    for sf in all_sumfacts:
        _cache_vectorization_info(sf, sfdict[sf])
def level1_optimal_vectorization_strategy(sumfacts, width):
    """Level 1: pick the optimal quadrature point tuple.

    Enumerates candidate quadrature tuples (optionally rounding up the
    relevant direction to multiples of vectorization depths), computes the
    level 2 optimum for each and returns the cheapest (qp, strategy) pair.
    """
    # Gather a list of possible quadrature point tuples
    quad_points = [quadrature_points_per_direction()]
    if get_form_option("vectorization_allow_quadrature_changes"):
        sf = next(iter(sumfacts))
        depth = 1
        while depth <= width:
            # Slice direction 1 on facet integrals, 0 otherwise
            i = 0 if sf.matrix_sequence[0].face is None else 1
            quad = list(quadrature_points_per_direction())
            quad[i] = round_to_multiple(quad[i], depth)
            quad_points.append(tuple(quad))
            depth = depth * 2
        quad_points = list(set(quad_points))

    # Find the minimum cost strategy between all the quadrature point tuples
    optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points}
    qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))

    # If we are using the 'target' strategy, we might want to log some information.
    if get_form_option("vectorization_strategy") == "target":
        # Print the achieved cost and the target cost on the screen
        set_form_option("vectorization_strategy", "model")
        cost = strategy_cost((qp, optimal_strategies[qp]))
        print("The target cost was: {}".format(get_form_option("vectorization_target")))
        print("The achieved cost was: {}".format(cost))
        set_form_option("vectorization_strategy", "target")
        print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp]))))

        # Print the employed vectorization strategy into a file
        filename = "targetstrat_{}.log".format(int(float(get_form_option("vectorization_target"))))
        with open(filename, 'w') as f:
            f.write("\n".join(stringify_vectorization_strategy((qp, optimal_strategies[qp]))))

        # Write an entry into a csvfile which connects the given measuring identifier with a cost
        from dune.testtools.parametertree.parser import parse_ini_file
        inifile = parse_ini_file(get_option("ini_file"))
        # NOTE(review): the line defining 'identifier' was lost in
        # transcription; reconstructed from the otherwise-unused inifile --
        # confirm the key name against upstream.
        identifier = inifile["identifier"]
        # TODO: Depending on the number of samples, we might need a file lock here.
        with open("mapping.csv", 'a') as f:
            f.write(" ".join((identifier, str(cost), short_stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + "\n")

    return qp, optimal_strategies[qp]
def level2_optimal_vectorization_strategy(sumfacts, width, qp):
    """Level 2: optimize each parallelizability class independently and merge.

    Kernels are grouped by their parallel_key; the cheapest vectorization
    opportunity is picked per group and the per-group optima are combined
    into a single strategy dict.
    """
    # Group the kernels into sets that can be realized simultaneously
    parallel_keys = frozenset(sf.parallel_key for sf in sumfacts)

    result = frozendict()
    for pkey in parallel_keys:
        subset = frozenset(sf for sf in sumfacts if sf.parallel_key == pkey)

        # Minimize over all the opportunities for the subset given by the current key
        best = min(level2_optimal_vectorization_strategy_generator(subset, width, qp),
                   key=fixedqp_strategy_costfunction(qp))
        result = add_to_frozendict(result, best)

    return result
def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
    """Yield vectorization opportunities, each completed with scalar fallbacks.

    Every (possibly partial) opportunity produced by the recursive generator
    is extended so that kernels it does not cover are realized unvectorized,
    each with a fresh buffer.
    """
    for partial_strategy in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
        # Add non-vectorized implementation information to all kernels that
        # are not present in the optimal strategy
        fallbacks = {sf: sf.copy(buffer=get_counted_variable("buffer"))
                     for sf in sumfacts if sf not in partial_strategy}
        yield add_to_frozendict(partial_strategy, fallbacks)
def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()):
    """Recursively enumerate vectorization opportunities for the given kernels.

    For each choice of 'parallel' (number of input coefficient groups worked
    on simultaneously) and each power-of-two horizontal width, a combination
    of kernels is attempted; accepted combinations are recursed upon with
    the remaining kernels.  (Transcription junk lines were removed from this
    body; the retained statements are unchanged.)
    """
    if len(sumfacts) == 0:
        yield already
        return

    # We store the information whether a vectorization opportunity has been yielded from this
    # generator to yield an incomplete strategy if not (which is then completed with unvectorized
    # kernel implementations)
    yielded = False

    # Find the number of input coefficients we can work on
    keys = frozenset(sf.inout_key for sf in sumfacts)
    inoutkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.inout_key == key, sumfacts))) for key in sorted(keys)]

    for parallel in (1, 2):
        if parallel > len(keys):
            continue

        horizontal = 1
        while horizontal <= width // parallel:
            combo = sum((inoutkey_sumfacts[part][:horizontal] for part in range(parallel)), ())
            vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
            # Double first so the loop always progresses, even on rejection
            horizontal *= 2

            if vecdict is None:
                # This particular choice was rejected for some reason.
                # Possible reasons:
                # * the quadrature point tuple not being suitable
                #   for this vectorization strategy
                # * there are not enough horizontal kernels
                continue

            # Go into recursion to also vectorize all kernels not in this combo
            for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
                                                                        width,
                                                                        qp,
                                                                        add_to_frozendict(already, vecdict),
                                                                        ):
                yielded = True
                yield opp

    # If we did not yield on this recursion level, yield what we got so far
    if not yielded:
        yield already
def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
    """Map each kernel in sumfacts to one shared VectorizedSumfactKernel.

    Returns None when this combination is rejected: either there are not
    enough horizontal kernels, or the quadrature points in the slicing
    direction are not divisible by the vertical width.  (Transcription junk
    lines were removed from this body; the retained statements are
    unchanged.)
    """
    # Discard opportunities that do not contain enough horizontal kernels
    if len(sumfacts) not in (horizontal, horizontal - 1):
        return None

    # Enhance the list of sumfact nodes by adding vertical splittings
    kernels = []
    for sf in sumfacts:
        # No slicing needed in the pure horizontal case
        if vertical == 1:
            kernels.append(sf)
            continue

        # Determine the slicing direction
        slice_direction = 0 if sf.matrix_sequence[0].face is None else 1
        if qp[slice_direction] % vertical != 0:
            return None

        # Split the basis tabulation matrices
        oldtab = sf.matrix_sequence[slice_direction]
        for i in range(vertical):
            seq = list(sf.matrix_sequence)
            seq[slice_direction] = oldtab.copy(slice_size=vertical,
                                               slice_index=i)
            kernels.append(sf.copy(matrix_sequence=tuple(seq)))

    # Join the new kernels into a sum factorization node
    buffer = get_counted_variable("joined_buffer")
    return {sf: VectorizedSumfactKernel(kernels=tuple(kernels),
                                        horizontal_width=horizontal,
                                        vertical_width=vertical,
                                        buffer=buffer,
                                        ) for sf in sumfacts}