""" Sum factorization vectorization """ from __future__ import division import logging from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.vcl import get_vcl_type_size from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel from dune.perftool.generation import (backend, generator_factory, get_backend, get_counted_variable, get_global_context_value, ) from dune.perftool.pdelab.restriction import (Restriction, restricted_name, ) from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray, quadrature_points_per_direction, set_quadrature_points, ) from dune.perftool.error import PerftoolVectorizationError from dune.perftool.options import get_form_option, get_option, set_form_option from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff from pytools import product from frozendict import frozendict import itertools as it import loopy as lp import numpy as np import math @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o) def _cache_vectorization_info(old, new): if get_form_option("vectorization_not_fully_vectorized_error"): if not isinstance(new, VectorizedSumfactKernel): raise PerftoolVectorizationError("Did not fully vectorize!") if new is None: raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!") return new _collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True) def attach_vectorization_info(sf): assert isinstance(sf, SumfactKernel) if get_global_context_value("dry_run"): return _collect_sumfact_nodes(sf) else: return _cache_vectorization_info(sf, None) def costmodel(sf): # Penalize vertical vectorization vertical_penalty = 1 + math.log(sf.vertical_width) # Penalize scalar sum factorization kernels scalar_penalty = 1 if isinstance(sf, SumfactKernel): scalar_penalty = get_vcl_type_size(dtype_floatingpoint()) # Return total operations return sf.operations * vertical_penalty * scalar_penalty def explicit_costfunction(sf): # Read the explicitly set values for horizontal and vertical vectorization width = get_vcl_type_size(dtype_floatingpoint()) horizontal = get_form_option("vectorization_horizontal") if horizontal is None: horizontal = width vertical = get_form_option("vectorization_vertical") if vertical is None: vertical = 1 horizontal = int(horizontal) vertical = int(vertical) if sf.horizontal_width == horizontal and sf.vertical_width == vertical: # Penalize position mapping return sf.operations else: return 1000000000000 def target_costfunction(sf): # The cost of a kernel is given by the difference to the desired target cost. # Pitfall: The target cost needs to be weighed to account for this being called # on subsets and not on a full vectorization strategy! _, all_sf, _ = filter_active_inactive_sumfacts() total = len(all_sf) target = float(get_form_option("vectorization_target")) realcost = costmodel(sf) ratio = sf.horizontal_width / total return abs(realcost - ratio * target) def strategy_cost(strat_tuple): qp, strategy = strat_tuple # Choose the correct cost function s = get_form_option("vectorization_strategy") if s == "model": func = costmodel elif s == "explicit": func = explicit_costfunction elif s == "target": func = target_costfunction else: raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s)) keys = set(sf.cache_key for sf in strategy.values()) set_quadrature_points(qp) # Sum over all the sum factorization kernels in the realization score = 0.0 for sf in strategy.values(): if sf.cache_key in keys: score = score + float(func(sf)) keys.discard(sf.cache_key) return score def fixedqp_strategy_costfunction(qp): def _cost(strategy): return strategy_cost((qp, strategy)) return _cost def stringify_vectorization_strategy(strategy): result = [] qp, strategy = strategy result.append("Printing potential vectorization strategy:") result.append("Quadrature point tuple: {}".format(qp)) # Look for all realizations in the strategy and iterate over them cache_keys = frozenset(v.cache_key for v in strategy.values()) for ck in cache_keys: # Filter all the kernels that are realized by this and print for key in strategy: if strategy[key].cache_key == ck: result.append("{}:".format(key)) # Find one representative to print for val in strategy.values(): if val.cache_key == ck: result.append(" {}".format(val)) break return result def short_stringify_vectorization_strategy(strategy): """ A short string decribing the vectorization strategy. This is used in costmodel validation plots to describe what a data point does """ qp, strategy = strategy def _short(k): if isinstance(k, VectorizedSumfactKernel): return str(k.horizontal_width) else: return "scalar" stage1 = [] stage3 = [] for kernel in strategy.values(): if kernel.stage == 1: stage1.append(_short(kernel)) if kernel.stage == 3: stage3.append(_short(kernel)) return "m0={};S1:{};S3:{}".format(qp[0], "|".join(stage1), "|".join(stage3)) def filter_active_inactive_sumfacts(): # Retrieve all sum factorization kernels for stage 1 and 3 from dune.perftool.generation import retrieve_cache_items all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")] # Stage 1 sum factorizations that were actually used basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')] # This means we can have sum factorizations that will not get used inactive_sumfacts = [i for i in all_sumfacts if i.stage == 1 and i not in basis_sumfacts] # All sum factorization kernels that get used active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts] return all_sumfacts, active_sumfacts, inactive_sumfacts def decide_vectorization_strategy(): """ Decide how to vectorize! Note that the vectorization of the quadrature loop is independent of this, as it is implemented through a post-processing (== loopy transformation) step. """ logger = logging.getLogger(__name__) all_sumfacts, active_sumfacts, inactive_sumfacts = filter_active_inactive_sumfacts() # If no vectorization is needed, abort now if get_form_option("vectorization_strategy") == "none" or (get_global_context_value("form_type") == "jacobian" and not get_form_option("vectorization_jacobians")): for sf in all_sumfacts: _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer"))) return logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes" .format(len(active_sumfacts))) # # Find the best vectorization strategy by using a costmodel # # Note that this optimization procedure uses a hierarchic approach to bypass # the problems of unfavorable complexity of the set of all possible vectorization # opportunities. Optimizations are performed at different levels (you find these # levels in the function names implementing them), where optimal solutions at a # higher level are combined into lower level solutions or optima of optimal solutions # at higher level are calculated: # * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima) # * Level 2: Split by parallelizability and combine optima into optimal solution # * Level 3: Optimize number of different inputs to consider # * Level 4: Optimize horizontal/vertical/hybrid strategy width = get_vcl_type_size(dtype_floatingpoint()) qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width) set_quadrature_points(qp) logger.debug("decide_vectorization_strategy: Decided for the following strategy:" + "\n " + "\n ".join(stringify_vectorization_strategy((qp, sfdict)))) # We map inactive sum factorization kernels to 0 sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts}) # Register the results for sf in all_sumfacts: _cache_vectorization_info(sf, sfdict[sf]) def level1_optimal_vectorization_strategy(sumfacts, width): # Gather a list of possible quadrature point tuples quad_points = [quadrature_points_per_direction()] if get_form_option("vectorization_allow_quadrature_changes"): sf = next(iter(sumfacts)) depth = 1 while depth <= width: i = 0 if sf.matrix_sequence[0].face is None else 1 quad = list(quadrature_points_per_direction()) quad[i] = round_to_multiple(quad[i], depth) quad_points.append(tuple(quad)) depth = depth * 2 quad_points = list(set(quad_points)) # Find the minimum cost strategy between all the quadrature point tuples optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points} qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) # If we are using the 'target' strategy, we might want to log some information. if get_form_option("vectorization_strategy") == "target": # Print the achieved cost and the target cost on the screen set_form_option("vectorization_strategy", "model") cost = strategy_cost((qp, optimal_strategies[qp])) print("The target cost was: {}".format(get_form_option("vectorization_target"))) print("The achieved cost was: {}".format(cost)) set_form_option("vectorization_strategy", "target") print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp])))) # Print the employed vectorization strategy into a file from os.path import join filename = join(get_option("project_basedir"), "costmodel-verification", "targetstrat_{}.txt".format(int(float(get_form_option("vectorization_target"))))) with open(filename, 'w') as f: f.write("\n".join(stringify_vectorization_strategy((qp, optimal_strategies[qp])))) # Write an entry into a csvfile which connects the given measuring identifier with a cost from dune.testtools.parametertree.parser import parse_ini_file inifile = parse_ini_file(get_option("ini_file")) identifier = inifile["identifier"] filename = join(get_option("project_basedir"), "costmodel-verification", "mapping.csv") # TODO: Depending on the number of samples, we might need a file lock here. with open(filename, 'a') as f: f.write(" ".join((identifier, str(cost), short_stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + "\n") return qp, optimal_strategies[qp] def level2_optimal_vectorization_strategy(sumfacts, width, qp): # Find the sets of simultaneously realizable kernels keys = frozenset(sf.parallel_key for sf in sumfacts) # Find minimums for each of these sets sfdict = frozendict() for key in keys: key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key) # Minimize over all the opportunities for the subset given by the current key key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp), key=fixedqp_strategy_costfunction(qp)) sfdict = add_to_frozendict(sfdict, key_strategy) return sfdict def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): # Add non-vectorized implementation information to all kernels that are not present in # the optimal strategy yield add_to_frozendict(opp, {sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp}) def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()): if len(sumfacts) == 0: yield already return # We store the information whether a vectorization opportunity has been yielded from this # generator to yield an incomplete strategy if not (which is then completed with unvectorized # kernel implementations) yielded = False # Find the number of input coefficients we can work on keys = frozenset(sf.inout_key for sf in sumfacts) inoutkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.inout_key == key, sumfacts))) for key in sorted(keys)] for parallel in (1, 2): if parallel > len(keys): continue horizontal = 1 while horizontal <= width // parallel: combo = sum((inoutkey_sumfacts[part][:horizontal] for part in range(parallel)), ()) vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) horizontal *= 2 if vecdict is None: # This particular choice was rejected for some reason. # Possible reasons: # * the quadrature point tuple not being suitable # for this vectorization strategy # * there are not enough horizontal kernels continue # Go into recursion to also vectorize all kernels not in this combo for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), width, qp, add_to_frozendict(already, vecdict), ): yielded = True yield opp # If we did not yield on this recursion level, yield what we got so far if not yielded: yield already def get_vectorization_dict(sumfacts, vertical, horizontal, qp): # Discard opportunities that do not contain enough horizontal kernels if len(sumfacts) not in (horizontal, horizontal - 1): return None # Enhance the list of sumfact nodes by adding vertical splittings kernels = [] for sf in sumfacts: # No slicing needed in the pure horizontal case if vertical == 1: kernels.append(sf) continue # Determine the slicing direction slice_direction = 0 if sf.matrix_sequence[0].face is None else 1 if qp[slice_direction] % vertical != 0: return None # Split the basis tabulation matrices oldtab = sf.matrix_sequence[slice_direction] for i in range(vertical): seq = list(sf.matrix_sequence) seq[slice_direction] = oldtab.copy(slice_size=vertical, slice_index=i) kernels.append(sf.copy(matrix_sequence=tuple(seq))) # Join the new kernels into a sum factorization node buffer = get_counted_variable("joined_buffer") return {sf: VectorizedSumfactKernel(kernels=tuple(kernels), horizontal_width=horizontal, vertical_width=vertical, buffer=buffer, ) for sf in sumfacts}