""" Sum factorization vectorization """ import logging from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.vcl import get_vcl_type_size from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel from dune.perftool.generation import (backend, generator_factory, get_backend, get_counted_variable, get_global_context_value, ) from dune.perftool.pdelab.restriction import (Restriction, restricted_name, ) from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray, quadrature_points_per_direction, set_quadrature_points, ) from dune.perftool.error import PerftoolVectorizationError from dune.perftool.options import get_form_option from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff from pytools import product from frozendict import frozendict import itertools as it import loopy as lp import numpy as np import math @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o) def _cache_vectorization_info(old, new): if new is None: raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!") return new _collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True) def attach_vectorization_info(sf): assert isinstance(sf, SumfactKernel) if get_global_context_value("dry_run"): return _collect_sumfact_nodes(sf) else: return _cache_vectorization_info(sf, None) def position_penalty_factor(sf): if isinstance(sf, SumfactKernel) or sf.vertical_width > 1: return 1 else: return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length)) @backend(interface="vectorization_strategy", name="model") def costmodel(sf): # Penalize vertical vectorization vertical_penalty = 1 + math.log(sf.vertical_width) # Penalize scalar sum factorization kernels scalar_penalty = 1 if isinstance(sf, SumfactKernel): scalar_penalty = get_vcl_type_size(dtype_floatingpoint()) # Return total operations return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty @backend(interface="vectorization_strategy", name="fromlist") def fromlist_costmodel(sf): # The fromlist strategy needs to reuse the cost model! return costmodel(sf) @backend(interface="vectorization_strategy", name="explicit") def explicit_costfunction(sf): # Read the explicitly set values for horizontal and vertical vectorization width = get_vcl_type_size(dtype_floatingpoint()) horizontal = get_form_option("vectorization_horizontal") if horizontal is None: horizontal = width vertical = get_form_option("vectorization_vertical") if vertical is None: vertical = 1 horizontal = int(horizontal) vertical = int(vertical) if sf.horizontal_width == horizontal and sf.vertical_width == vertical: # Penalize position mapping return sf.operations * position_penalty_factor(sf) else: return 1000000000000 def strategy_cost(strat_tuple): qp, strategy = strat_tuple func = get_backend(interface="vectorization_strategy", selector=lambda: get_form_option("vectorization_strategy")) keys = set(sf.cache_key for sf in strategy.values()) set_quadrature_points(qp) # Sum over all the sum factorization kernels in the realization score = 0.0 for sf in strategy.values(): if sf.cache_key in keys: score = score + float(func(sf)) keys.discard(sf.cache_key) return score def fixedqp_strategy_costfunction(qp): def _cost(strategy): return strategy_cost((qp, strategy)) return _cost def stringify_vectorization_strategy(strategy): result = [] qp, strategy = strategy result.append("Printing potential vectorization strategy:") result.append("Quadrature point tuple: {}".format(qp)) # Look for all realizations in the strategy and iterate over them cache_keys = frozenset(v.cache_key for v in strategy.values()) for ck in cache_keys: # Filter all the kernels that are realized by this and print for key in strategy: if strategy[key].cache_key == ck: result.append("{}:".format(key)) # Find one representative to print for val in strategy.values(): if val.cache_key == ck: result.append(" {}".format(val)) break return result def decide_vectorization_strategy(): """ Decide how to vectorize! Note that the vectorization of the quadrature loop is independent of this, as it is implemented through a post-processing (== loopy transformation) step. """ logger = logging.getLogger(__name__) # Retrieve all sum factorization kernels for stage 1 and 3 from dune.perftool.generation import retrieve_cache_items all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")] # Stage 1 sum factorizations that were actually used basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')] # This means we can have sum factorizations that will not get used inactive_sumfacts = [i for i in all_sumfacts if i.stage == 1 and i not in basis_sumfacts] # All sum factorization kernels that get used active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts] # If no vectorization is needed, abort now if get_form_option("vectorization_strategy") == "none": for sf in all_sumfacts: _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer"))) return logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes" .format(len(active_sumfacts))) # # Find the best vectorization strategy by using a costmodel # # Note that this optimization procedure uses a hierarchic approach to bypass # the problems of unfavorable complexity of the set of all possible vectorization # opportunities. Optimizations are performed at different levels (you find these # levels in the function names implementing them), where optimal solutions at a # higher level are combined into lower level solutions or optima of optimal solutions # at higher level are calculated: # * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima) # * Level 2: Split by parallelizability and combine optima into optimal solution # * Level 3: Optimize number of different inputs to consider # * Level 4: Optimize horizontal/vertical/hybrid strategy width = get_vcl_type_size(dtype_floatingpoint()) qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width) # TODO: Check how the 'fromlist' generator fits into the new overall picture # # if get_form_option("vectorization_strategy") == "fromlist": # # This is a bit special and does not follow the minimization procedure at all # # def _choose_strategy_from_list(stage1_sumfacts): # strategy = 0 # for qp in quad_points: # for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp): # if strategy == int(get_form_option("vectorization_list_index")): # # Output the strategy and its cost into a separate file # if get_global_context_value("form_type") == "jacobian_apply": # with open("strategycosts.csv", "a") as f: # f.write("{} {}\n".format(strategy, strategy_cost((qp, strat)))) # return qp, strat # strategy = strategy + 1 # # raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_form_option("vectorization_list_index"))) # # s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1) # # total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points) # print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_form_option("vectorization_list_index")), # total)) # qp, sfdict = _choose_strategy_from_list(s1_sumfacts) # # keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1) # for key in keys: # key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key) # minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp), # key=fixedqp_strategy_costfunction(qp)) # sfdict = add_to_frozendict(sfdict, minimum) set_quadrature_points(qp) logger.debug("decide_vectorization_strategy: Decided for the following strategy:" "\n".join(stringify_vectorization_strategy((qp, sfdict)))) # We map inactive sum factorization kernels to 0 sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts}) # Register the results for sf in all_sumfacts: _cache_vectorization_info(sf, sfdict[sf]) def level1_optimal_vectorization_strategy(sumfacts, width): # Gather a list of possible quadrature point tuples quad_points = [quadrature_points_per_direction()] if get_form_option("vectorization_allow_quadrature_changes"): sf = next(iter(sumfacts)) depth = 1 while depth <= width: i = 0 if sf.matrix_sequence[0].face is None else 1 quad = list(quadrature_points_per_direction()) quad[i] = round_to_multiple(quad[i], depth) quad_points.append(tuple(quad)) depth = depth * 2 quad_points = list(set(quad_points)) # Find the minimum cost strategy between all the quadrature point tuples optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points} qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) return qp, optimal_strategies[qp] def level2_optimal_vectorization_strategy(sumfacts, width, qp): # Find the sets of simultaneously realizable kernels keys = frozenset(sf.parallel_key for sf in sumfacts) # Find minimums for each of these sets sfdict = frozendict() for key in keys: key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key) key_strategy = level3_optimal_vectorization_strategy(key_sumfacts, width, qp) sfdict = add_to_frozendict(sfdict, key_strategy) return sfdict def level3_optimal_vectorization_strategy(sumfacts, width, qp): # Find the sets of simultaneously realizable kernels (thats an equivalence relation) keys = frozenset(sf.input_key for sf in sumfacts) # Find minimums for each of these sets sfdict = frozendict() for key in keys: key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key) minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp), key=fixedqp_strategy_costfunction(qp)) sfdict = add_to_frozendict(sfdict, minimum) return sfdict def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()): if len(sumfacts) == 0: # We have gone into recursion deep enough to have all sum factorization nodes # assigned their vectorized counterpart. We can yield the result now! yield already return # Ensure a deterministic order of the given sumfact kernels. This is necessary for the # fromlist strategy to pick correct strategies across different program runs sumfacts = sorted(sumfacts, key=lambda sf: repr(sf)) # Otherwise we pick a random sum factorization kernel and construct all the vectorization # opportunities realizing this particular kernel and go into recursion. sf_to_decide = sumfacts[0] # Have "unvectorized" as an option, although it is not good for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]), width, qp, add_to_frozendict(already, {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))} ), ): yield opp horizontal = 1 while horizontal <= width: # Iterate over the possible combinations of sum factorization kernels # taking into account all the permutations of kernels. This also includes # combinations which use a padding of 1 - but only for pure horizontality. generators = [it.permutations(sumfacts, horizontal)] if horizontal >= 4: generators.append(it.permutations(sumfacts, horizontal - 1)) for combo in it.chain(*generators): # The chosen kernels must be part of the kernels for recursion # to work correctly if sf_to_decide not in combo: continue # Set up the vectorization dict for this combo vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp) if vecdict is None: # This particular choice was rejected for some reason. # Possible reasons: # * the quadrature point tuple not being suitable # for this vectorization strategy continue # Go into recursion to also vectorize all kernels not in this combo for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo), width, qp, add_to_frozendict(already, vecdict), ): yield opp horizontal = horizontal * 2 def get_vectorization_dict(sumfacts, vertical, horizontal, qp): # Enhance the list of sumfact nodes by adding vertical splittings kernels = [] for sf in sumfacts: # No slicing needed in the pure horizontal case if vertical == 1: kernels.append(sf) continue # Determine the slicing direction slice_direction = 0 if sf.matrix_sequence[0].face is None else 1 if qp[slice_direction] % vertical != 0: return None # Split the basis tabulation matrices oldtab = sf.matrix_sequence[slice_direction] for i in range(vertical): seq = list(sf.matrix_sequence) seq[slice_direction] = oldtab.copy(slice_size=vertical, slice_index=i) kernels.append(sf.copy(matrix_sequence=tuple(seq))) # Join the new kernels into a sum factorization node buffer = get_counted_variable("joined_buffer") return {sf: VectorizedSumfactKernel(kernels=tuple(kernels), horizontal_width=horizontal, vertical_width=vertical, buffer=buffer, ) for sf in sumfacts}