""" Sum factorization vectorization """
from dune.perftool.loopy.target import dtype_floatingpoint
from dune.perftool.loopy.vcl import get_vcl_type_size
from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
from dune.perftool.generation import (backend,
                                      generator_factory,
                                      get_backend,
                                      get_counted_variable,
                                      get_global_context_value,
                                      )
from dune.perftool.pdelab.restriction import (Restriction,
                                              restricted_name,
                                              )
from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                              quadrature_points_per_direction,
                                              set_quadrature_points,
                                              )
from dune.perftool.error import PerftoolVectorizationError
from dune.perftool.options import get_form_option
from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff
from pytools import product
from frozendict import frozendict
import itertools as it
import logging
import loopy as lp
import math
import numpy as np
@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
def _cache_vectorization_info(old, new):
    if new is None:
        raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!")
    return new
_collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True)
def attach_vectorization_info(sf):
    assert isinstance(sf, SumfactKernel)
    if get_global_context_value("dry_run"):
        return _collect_sumfact_nodes(sf)
    else:
        return _cache_vectorization_info(sf, None)
def position_penalty_factor(sf):
if isinstance(sf, SumfactKernel) or sf.vertical_width > 1:
return 1
else:
return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length))
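# For illustration of the penalty above (hypothetical priorities): a VectorizedSumfactKernel
# of length 4 whose kernels request position_priority (0, 1, None, None) and are placed in
# exactly that order gets factor 1 + 0 = 1; swapping the first two kernels would add
# |1 - 0| + |0 - 1| = 2, i.e. factor 3.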
@backend(interface="vectorization_strategy", name="model")
def costmodel(sf):
# Penalize vertical vectorization
vertical_penalty = 1 + math.log(sf.vertical_width)
# Penalize scalar sum factorization kernels
scalar_penalty = 1
if isinstance(sf, SumfactKernel):
scalar_penalty = get_vcl_type_size(dtype_floatingpoint())
return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
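# Rough illustration of the factors above (hypothetical numbers): assuming
# get_vcl_type_size(dtype_floatingpoint()) == 4 (e.g. double precision with 256 bit SIMD),
# a scalar SumfactKernel with 1000 operations scores
#     1000 * 1 * (1 + log(1)) * 4 = 4000,
# while a vectorized kernel with vertical_width == 4 and a comparable operation count scores
#     1000 * 1 * (1 + log(4)) * 1 ≈ 2386,
# so the model prefers vectorized realizations over the scalar fallback.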
@backend(interface="vectorization_strategy", name="fromlist")
def fromlist_costmodel(sf):
# The fromlist strategy needs to reuse the cost model!
return costmodel(sf)
@backend(interface="vectorization_strategy", name="explicit")
def explicit_costfunction(sf):
# Read the explicitly set values for horizontal and vertical vectorization
width = get_vcl_type_size(dtype_floatingpoint())
    horizontal = get_form_option("vectorization_horizontal")
    if horizontal is None:
        horizontal = width
    vertical = get_form_option("vectorization_vertical")
if vertical is None:
vertical = 1
horizontal = int(horizontal)
vertical = int(vertical)
if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
# Penalize position mapping
return sf.operations * position_penalty_factor(sf)
return 1000000000000
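# Example of the effect above: with vectorization_horizontal=4 and vectorization_vertical
# left unset (treated as 1), only kernels with horizontal_width == 4 and vertical_width == 1
# are scored by the regular position penalty; every other kernel receives the prohibitive
# cost 1000000000000, so the minimization avoids such strategies whenever a matching one exists.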
def strategy_cost(strat_tuple):
    qp, strategy = strat_tuple

    # Select the cost function implementation for the configured strategy
    func = get_backend(interface="vectorization_strategy",
                       selector=lambda: get_form_option("vectorization_strategy"))
keys = set(sf.cache_key for sf in strategy.values())
set_quadrature_points(qp)
# Sum over all the sum factorization kernels in the realization
score = 0.0
for sf in strategy.values():
if sf.cache_key in keys:
score = score + float(func(sf))
keys.discard(sf.cache_key)
return score
def fixedqp_strategy_costfunction(qp):
def _cost(strategy):
return strategy_cost((qp, strategy))
return _cost
def stringify_vectorization_strategy(strategy):
result = []
qp, strategy = strategy
result.append("Printing potential vectorization strategy:")
result.append("Quadrature point tuple: {}".format(qp))
# Look for all realizations in the strategy and iterate over them
cache_keys = frozenset(v.cache_key for v in strategy.values())
for ck in cache_keys:
# Filter all the kernels that are realized by this and print
        for key in strategy:
            if strategy[key].cache_key == ck:
                result.append("{}:".format(key))

        # Find one representative to print
        for val in strategy.values():
            if val.cache_key == ck:
                result.append("    {}".format(val))
                break

    return result
def decide_vectorization_strategy():
""" Decide how to vectorize!
Note that the vectorization of the quadrature loop is independent of this,
as it is implemented through a post-processing (== loopy transformation) step.
"""
logger = logging.getLogger(__name__)
# Retrieve all sum factorization kernels for stage 1 and 3
from dune.perftool.generation import retrieve_cache_items
all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
# Stage 1 sum factorizations that were actually used
basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')]
# This means we can have sum factorizations that will not get used
inactive_sumfacts = [i for i in all_sumfacts if i.stage == 1 and i not in basis_sumfacts]
# All sum factorization kernels that get used
active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
# If no vectorization is needed, abort now
if get_form_option("vectorization_strategy") == "none":
for sf in all_sumfacts:
_cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
return
logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
.format(len(active_sumfacts)))
# Find the best vectorization strategy by using a costmodel
    # Note that this optimization procedure uses a hierarchic approach to avoid the
    # unfavorable combinatorial complexity of enumerating all possible vectorization
    # opportunities at once. Optimizations are performed at several levels (the levels
    # appear in the names of the functions implementing them): optimal solutions found
    # at a higher-numbered level are either combined into a solution at the level below,
    # or the level below picks the best among them:
# * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima)
# * Level 2: Split by parallelizability and combine optima into optimal solution
# * Level 3: Optimize number of different inputs to consider
# * Level 4: Optimize horizontal/vertical/hybrid strategy
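    # Roughly (for illustration only), the levels compose into nested minimizations:
    #     level1 = argmin over qp of strategy_cost((qp, level2(sumfacts, width, qp)))
    #     level2(qp) = merge over parallel_key groups of
    #                  argmin over generated opportunities of strategy_cost((qp, .))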
width = get_vcl_type_size(dtype_floatingpoint())
qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width)
# TODO: Check how the 'fromlist' generator fits into the new overall picture
#
# if get_form_option("vectorization_strategy") == "fromlist":
# # This is a bit special and does not follow the minimization procedure at all
#
# def _choose_strategy_from_list(stage1_sumfacts):
# strategy = 0
# for qp in quad_points:
# for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp):
# if strategy == int(get_form_option("vectorization_list_index")):
# # Output the strategy and its cost into a separate file
# if get_global_context_value("form_type") == "jacobian_apply":
# with open("strategycosts.csv", "a") as f:
# f.write("{} {}\n".format(strategy, strategy_cost((qp, strat))))
# return qp, strat
# strategy = strategy + 1
#
# raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_form_option("vectorization_list_index")))
#
# s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1)
#
# total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points)
# print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_form_option("vectorization_list_index")),
# total))
# qp, sfdict = _choose_strategy_from_list(s1_sumfacts)
#
# keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1)
# for key in keys:
# key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key)
# minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
# key=fixedqp_strategy_costfunction(qp))
# sfdict = add_to_frozendict(sfdict, minimum)
set_quadrature_points(qp)
logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
"\n".join(stringify_vectorization_strategy((qp, sfdict))))
# We map inactive sum factorization kernels to 0
sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
# Register the results
for sf in all_sumfacts:
_cache_vectorization_info(sf, sfdict[sf])
def level1_optimal_vectorization_strategy(sumfacts, width):
# Gather a list of possible quadrature point tuples
quad_points = [quadrature_points_per_direction()]
if get_form_option("vectorization_allow_quadrature_changes"):
sf = next(iter(sumfacts))
depth = 1
while depth <= width:
i = 0 if sf.matrix_sequence[0].face is None else 1
quad = list(quadrature_points_per_direction())
quad[i] = round_to_multiple(quad[i], depth)
quad_points.append(tuple(quad))
depth = depth * 2
quad_points = list(set(quad_points))
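    # Hypothetical example of the candidate set built above: with a SIMD width of 4 and
    # quadrature_points_per_direction() == (5, 5) on a volume integral (face is None,
    # so i == 0), rounding the first entry to multiples of 1, 2 and 4 could yield the
    # candidates (5, 5), (6, 5) and (8, 5), assuming round_to_multiple rounds upwards
    # to the next multiple.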
# Find the minimum cost strategy between all the quadrature point tuples
optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points}
qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
return qp, optimal_strategies[qp]
def level2_optimal_vectorization_strategy(sumfacts, width, qp):
# Find the sets of simultaneously realizable kernels
keys = frozenset(sf.parallel_key for sf in sumfacts)
# Find minimums for each of these sets
sfdict = frozendict()
for key in keys:
key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key)
key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp),
key=fixedqp_strategy_costfunction(qp))
sfdict = add_to_frozendict(sfdict, key_strategy)
return sfdict
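# For illustration (hypothetical keys): if the active kernels split into two parallel_key
# groups {A1, A2} and {B1}, the two groups are optimized independently and the per-group
# minima are merged into a single frozendict; kernels from different groups are never
# fused into the same VectorizedSumfactKernel at this level.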
def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
# Add non-vectorized implementation information to all kernels that are not present in
# the optimal strategy
yield add_to_frozendict(opp,
{sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp})
def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()):
if len(sumfacts) == 0:
yield already
return
    # Remember whether this generator has yielded any vectorization opportunity. If it has
    # not, we yield the incomplete strategy gathered so far, which is later completed with
    # unvectorized kernel implementations.
yielded = False
# Find the number of input coefficients we can work on
keys = frozenset(sf.input_key for sf in sumfacts)
inputkey_sumfacts = [frozenset(filter(lambda sf: sf.input_key == key, sumfacts)) for key in keys]
for parallel in (1, 2):
if parallel == 2 and next(iter(sumfacts)).stage == 3:
continue
for which in filter(lambda w: w == tuple(sorted(w)),
it.permutations(range(len(keys)), parallel)):
horizontal = 1
while horizontal <= width // parallel:
generators = [filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal) for part in which)))]
if horizontal >=4:
generators.append(filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal - 1) for part in which))))
for combo in it.chain(*generators):
combo = sum(combo, ())
vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
if vecdict is None:
# This particular choice was rejected for some reason.
# Possible reasons:
# * the quadrature point tuple not being suitable
# for this vectorization strategy
continue
# Go into recursion to also vectorize all kernels not in this combo
for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
width,
qp,
add_to_frozendict(already, vecdict),
):
yielded = True
yield opp
horizontal *= 2
# If we did not yield on this recursion level, yield what we got so far
if not yielded:
yield already
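# A rough feel for what the generator above enumerates (illustrative, not exhaustive):
# with width == 8 and a single input key, only parallel == 1 produces combinations and
# horizontal runs through 1, 2, 4, 8; for horizontal >= 4 an additional set of combinations
# with one kernel less (padding by one lane) is considered. Each accepted combination is
# turned into a VectorizedSumfactKernel via get_vectorization_dict and the remaining
# kernels are handled recursively.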
#
# def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
# if len(sumfacts) == 0:
# # We have gone into recursion deep enough to have all sum factorization nodes
# # assigned their vectorized counterpart. We can yield the result now!
# yield already
# return
#
# # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
# # fromlist strategy to pick correct strategies across different program runs
# sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
#
# # Otherwise we pick a random sum factorization kernel and construct all the vectorization
# # opportunities realizing this particular kernel and go into recursion.
# sf_to_decide = sumfacts[0]
#
# # Have "unvectorized" as an option, although it is not good
# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
# width,
# qp,
# add_to_frozendict(already,
# {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
# ),
# ):
# yield opp
#
# horizontal = 1
# while horizontal <= width:
# # Iterate over the possible combinations of sum factorization kernels
# # taking into account all the permutations of kernels. This also includes
# # combinations which use a padding of 1 - but only for pure horizontality.
# generators = [it.permutations(sumfacts, horizontal)]
# if horizontal >= 4:
# generators.append(it.permutations(sumfacts, horizontal - 1))
# for combo in it.chain(*generators):
# # The chosen kernels must be part of the kernels for recursion
# # to work correctly
# if sf_to_decide not in combo:
# continue
#
# # Set up the vectorization dict for this combo
# vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
# if vecdict is None:
# # This particular choice was rejected for some reason.
# # Possible reasons:
# # * the quadrature point tuple not being suitable
# # for this vectorization strategy
# continue
#
# # Go into recursion to also vectorize all kernels not in this combo
# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
# width,
# qp,
# add_to_frozendict(already, vecdict),
# ):
# yield opp
#
# horizontal = horizontal * 2
def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
# Enhance the list of sumfact nodes by adding vertical splittings
kernels = []
for sf in sumfacts:
# No slicing needed in the pure horizontal case
if vertical == 1:
kernels.append(sf)
continue
# Determine the slicing direction
slice_direction = 0 if sf.matrix_sequence[0].face is None else 1
if qp[slice_direction] % vertical != 0:
return None
# Split the basis tabulation matrices
oldtab = sf.matrix_sequence[slice_direction]
for i in range(vertical):
seq = list(sf.matrix_sequence)
seq[slice_direction] = oldtab.copy(slice_size=vertical,
slice_index=i)
kernels.append(sf.copy(matrix_sequence=tuple(seq)))
# Join the new kernels into a sum factorization node
buffer = get_counted_variable("joined_buffer")
return {sf: VectorizedSumfactKernel(kernels=tuple(kernels),
horizontal_width=horizontal,
vertical_width=vertical,
buffer=buffer,
) for sf in sumfacts}
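# Sketch of the vertical splitting performed above (hypothetical sizes): with vertical == 2,
# horizontal == 2 and qp == (6, 6) for a volume integral, each of the two incoming scalar
# kernels is split into two kernels whose tabulation matrix in direction 0 is a slice
# (slice_size=2, slice_index=0 or 1), and all four resulting kernels are joined into one
# VectorizedSumfactKernel that shares a counted "joined_buffer" variable.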