""" Sum factorization vectorization """ from __future__ import division import logging from dune.codegen.loopy.target import dtype_floatingpoint from dune.codegen.loopy.vcl import get_vcl_type_size from dune.codegen.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel from dune.codegen.generation import (generator_factory, get_counted_variable, get_global_context_value, kernel_cached, retrieve_cache_items, ) from dune.codegen.pdelab.restriction import (Restriction, restricted_name, ) from dune.codegen.sumfact.tabulation import (quadrature_points_per_direction, set_quadrature_points, ) from dune.codegen.error import CodegenVectorizationError from dune.codegen.options import get_form_option, get_option, form_option_context from dune.codegen.tools import add_to_frozendict, round_to_multiple, list_diff from pymbolic.mapper.flop_counter import FlopCounter from pytools import product from frozendict import frozendict import itertools as it import loopy as lp import numpy as np import math @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o) def _cache_vectorization_info(old, new): if get_form_option("vectorization_not_fully_vectorized_error"): if not isinstance(new, VectorizedSumfactKernel): raise CodegenVectorizationError("Did not fully vectorize!") if new is None: raise CodegenVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!") return new _collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True) def attach_vectorization_info(sf): assert isinstance(sf, SumfactKernel) if get_global_context_value("dry_run"): return _collect_sumfact_nodes(sf) else: return _cache_vectorization_info(sf, None) def costmodel(sf): # Penalize vertical vectorization and scalar execution verticality = sf.vertical_width if isinstance(sf, SumfactKernel): verticality = get_vcl_type_size(dtype_floatingpoint()) vertical_penalty = 1 + 0.5 * math.log(verticality, 2) memory_penalty = 1.0 if isinstance(sf, VectorizedSumfactKernel): memory_penalty = 1.0 + 0.25 * math.log(len(set(k.interface for k in sf.kernels)), 2) # Return total operations return sf.operations * vertical_penalty * memory_penalty def explicit_costfunction(sf): # Read the explicitly set values for horizontal and vertical vectorization width = get_vcl_type_size(dtype_floatingpoint()) horizontal = get_form_option("vectorization_horizontal") if horizontal is None: horizontal = width vertical = get_form_option("vectorization_vertical") if vertical is None: vertical = 1 horizontal = int(horizontal) vertical = int(vertical) if sf.horizontal_width == horizontal and sf.vertical_width == vertical: # Penalize position mapping return sf.operations else: return 1000000000000 def target_costfunction(sf): # The cost of a kernel is given by the difference to the desired target cost. # Pitfall: The target cost needs to be weighed to account for this being called # on subsets and not on a full vectorization strategy! _, all_sf, _ = filter_active_inactive_sumfacts() total = len(all_sf) target = float(get_form_option("vectorization_target")) realcost = costmodel(sf) ratio = sf.horizontal_width / total return abs(realcost - ratio * target) def accumulate_for_strategy(strategy, func, reduce=lambda x, y: x + y, start=0): """ Accumulate a quantity over a given vectorization strategy """ accum = start # Make sure to count each implementation exactly once keys = set(sf.cache_key for sf in strategy.values()) for sf in strategy.values(): if sf.cache_key in keys: accum = reduce(accum, func(sf)) keys.discard(sf.cache_key) return accum def strategy_cost(strat_tuple): qp, strategy = strat_tuple # Choose the correct cost function s = get_form_option("vectorization_strategy") if s == "model": func = costmodel elif s == "explicit": func = explicit_costfunction elif s == "target": func = target_costfunction elif s == "autotune": from dune.codegen.sumfact.autotune import autotune_realization func = autotune_realization else: raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s)) keys = set(sf.cache_key for sf in strategy.values()) set_quadrature_points(qp) return accumulate_for_strategy(strategy, lambda sf: float(func(sf))) class PrimitiveApproximateOpcounter(FlopCounter): def map_sumfact_kernel(self, expr): return 0 def map_tagged_variable(self, expr): return self.map_variable(expr) def map_loopy_function_identifier(self, expr): if hasattr(expr, "operations"): return expr.operations() raise NotImplementedError("The class {} should implement a symbolic flopcounter.".format(type(expr))) @generator_factory(item_tags=("opcounts",), context_tags="kernel") def store_operation_count(expr, count): return count def count_quadrature_point_operations(expr): counter = PrimitiveApproximateOpcounter() store_operation_count(expr, counter(expr)) def quadrature_penalized_strategy_cost(strat_tuple): """ Implements a penalization of the cost function that accounts for the increase in flops that occur in the quadrature loop. This needs to somehow get a guess of how much work is done in the quadrature loop relative to the sum factorization kernels. """ qp, strategy = strat_tuple # Evaluate the original cost function. This result will be scaled by this function. cost = strategy_cost(strat_tuple) if cost is 0: return 0 # Get the total number of Flops done in sum factorization kernels sf_flops = accumulate_for_strategy(strategy, lambda sf: sf.operations) # Get the minimal possible number of quadrature points and the actual quadrature points num_qp_new = product(qp) set_quadrature_points(None) num_qp_old = product(quadrature_points_per_direction()) set_quadrature_points(qp) # Determine the number of floating point operations per quadrature point. # This flop counting is a very crude approximation, but that is totally sufficient here. ops_per_qp = sum(i for i in retrieve_cache_items("opcounts")) # Do the actual scaling. return float((sf_flops + ops_per_qp * num_qp_new) / (sf_flops + ops_per_qp * num_qp_old)) * cost def fixedqp_strategy_costfunction(qp): def _cost(strategy): return strategy_cost((qp, strategy)) return _cost def stringify_vectorization_strategy(strategy): result = [] qp, strategy = strategy result.append("Printing potential vectorization strategy:") result.append("Quadrature point tuple: {}".format(qp)) # Look for all realizations in the strategy and iterate over them cache_keys = frozenset(v.cache_key for v in strategy.values()) for ck in cache_keys: # Filter all the kernels that are realized by this and print for key in strategy: if strategy[key].cache_key == ck: result.append("{}:".format(key)) # Find one representative to print for val in strategy.values(): if val.cache_key == ck: result.append(" {}".format(val)) break return result def short_stringify_vectorization_strategy(strategy): """ A short string decribing the vectorization strategy. This is used in costmodel validation plots to describe what a data point does """ qp, strategy = strategy def _short(k): if isinstance(k, VectorizedSumfactKernel): return str(k.horizontal_width) else: return "scalar" stage1 = [] stage3 = [] keys = set(sf.cache_key for sf in strategy.values()) for kernel in strategy.values(): if kernel.cache_key in keys: keys.discard(kernel.cache_key) if kernel.stage == 1: stage1.append(_short(kernel)) if kernel.stage == 3: stage3.append(_short(kernel)) return "m0={};S1:{};S3:{}".format(qp[0], "|".join(stage1), "|".join(stage3)) def filter_active_inactive_sumfacts(): # Retrieve all sum factorization kernels for stage 1 and 3 from dune.codegen.generation import retrieve_cache_items all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")] # Stage 1 sum factorizations that were actually used basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')] # This means we can have sum factorizations that will not get used inactive_sumfacts = [i for i in all_sumfacts if i.stage == 1 and i not in basis_sumfacts] # All sum factorization kernels that get used active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts] return all_sumfacts, active_sumfacts, inactive_sumfacts def decide_vectorization_strategy(): """ Decide how to vectorize! Note that the vectorization of the quadrature loop is independent of this, as it is implemented through a post-processing (== loopy transformation) step. """ logger = logging.getLogger(__name__) all_sumfacts, active_sumfacts, inactive_sumfacts = filter_active_inactive_sumfacts() # If no vectorization is needed, abort now if get_form_option("vectorization_strategy") == "none" or (get_global_context_value("form_type") == "jacobian" and not get_form_option("vectorization_jacobians")) or not get_form_option("sumfact"): for sf in all_sumfacts: _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer"))) return logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes" .format(len(active_sumfacts))) # # Find the best vectorization strategy by using a costmodel # # Note that this optimization procedure uses a hierarchic approach to bypass # the problems of unfavorable complexity of the set of all possible vectorization # opportunities. Optimizations are performed at different levels (you find these # levels in the function names implementing them), where optimal solutions at a # higher level are combined into lower level solutions or optima of optimal solutions # at higher level are calculated: # * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima) # * Level 2: Split by parallelizability and combine optima into optimal solution # * Level 3: Optimize number of different inputs to consider # * Level 4: Optimize horizontal/vertical/hybrid strategy width = get_vcl_type_size(dtype_floatingpoint()) qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width) set_quadrature_points(qp) logger.debug("decide_vectorization_strategy: Decided for the following strategy:" + "\n " + "\n ".join(stringify_vectorization_strategy((qp, sfdict)))) # We map inactive sum factorization kernels to 0 sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts}) # Register the results for sf in all_sumfacts: _cache_vectorization_info(sf, sfdict[sf]) def level1_optimal_vectorization_strategy(sumfacts, width): # Gather a list of possible quadrature point tuples quad_points = [quadrature_points_per_direction()] if get_form_option("vectorization_allow_quadrature_changes"): sf = next(iter(sumfacts)) depth = 1 while depth <= width: i = 0 if sf.matrix_sequence_quadrature_permuted[0].face is None else 1 quad = list(quadrature_points_per_direction()) quad[i] = round_to_multiple(quad[i], depth) quad_points.append(tuple(quad)) depth = depth * 2 quad_points = list(set(quad_points)) # Find the minimum cost strategy between all the quadrature point tuples optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points} # If we are using the 'target' strategy, we might want to log some information. if get_form_option("vectorization_strategy") == "target": # Print the achieved cost and the target cost on the screen with form_option_context(vectorization_strategy="model"): target = float(get_form_option("vectorization_target")) qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target)) cost = strategy_cost((qp, optimal_strategies[qp])) print("The target cost was: {}".format(target)) print("The achieved cost was: {}".format(cost)) optimum = level1_optimal_vectorization_strategy(sumfacts, width) print("The optimal cost would be: {}".format(strategy_cost(optimum))) print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp])))) # Print the employed vectorization strategy into a file suffix = "" if get_global_context_value("integral_type") == "interior_facet": suffix = "_dir{}_mod{}".format(get_global_context_value("facedir_s"), get_global_context_value("facemod_s")) filename = "targetstrat_{}{}.log".format(int(float(get_form_option("vectorization_target"))), suffix) with open(filename, 'w') as f: f.write("\n".join(stringify_vectorization_strategy((qp, optimal_strategies[qp])))) # Write an entry into a csvfile which connects the given measuring identifier with a cost from dune.testtools.parametertree.parser import parse_ini_file inifile = parse_ini_file(get_option("ini_file")) identifier = inifile["identifier"] # TODO: Depending on the number of samples, we might need a file lock here. with open("mapping.csv", 'a') as f: f.write(" ".join((identifier, str(cost), short_stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + "\n") else: qp = min(optimal_strategies, key=lambda qp: quadrature_penalized_strategy_cost((qp, optimal_strategies[qp]))) return qp, optimal_strategies[qp] def level2_optimal_vectorization_strategy(sumfacts, width, qp): # Find the sets of simultaneously realizable kernels keys = frozenset(sf.parallel_key for sf in sumfacts) # Find minimums for each of these sets sfdict = frozendict() for key in keys: key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key) # Minimize over all the opportunities for the subset given by the current key key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp), key=fixedqp_strategy_costfunction(qp)) sfdict = add_to_frozendict(sfdict, key_strategy) return sfdict def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): # Add non-vectorized implementation information to all kernels that are not present in # the optimal strategy yield add_to_frozendict(opp, {sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp}) def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()): if len(sumfacts) == 0: yield already return # We store the information whether a vectorization opportunity has been yielded from this # generator to yield an incomplete strategy if not (which is then completed with unvectorized # kernel implementations) yielded = False # Find the number of input coefficients we can work on keys = frozenset(sf.inout_key for sf in sumfacts) inoutkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.inout_key == key, sumfacts))) for key in sorted(keys)] for parallel in (1, 2): if parallel > len(keys): continue horizontal = 1 while horizontal <= width // parallel: combo = sum((inoutkey_sumfacts[part][:horizontal] for part in range(parallel)), ()) vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) horizontal *= 2 if vecdict is None: # This particular choice was rejected for some reason. # Possible reasons: # * the quadrature point tuple not being suitable # for this vectorization strategy # * there are not enough horizontal kernels continue # Go into recursion to also vectorize all kernels not in this combo for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), width, qp, add_to_frozendict(already, vecdict), ): yielded = True yield opp # If we did not yield on this recursion level, yield what we got so far if not yielded: yield already def get_vectorization_dict(sumfacts, vertical, horizontal, qp): # Discard opportunities that do not contain enough horizontal kernels if len(sumfacts) not in (horizontal, horizontal * vertical - 1): return None # Enhance the list of sumfact nodes by adding vertical splittings kernels = [] for sf in sumfacts: # No slicing needed in the pure horizontal case if vertical == 1: kernels.append(sf) continue # Determine the slicing direction slice_direction = 0 if sf.matrix_sequence_quadrature_permuted[0].face is None else 1 if qp[slice_direction] % vertical != 0: return None # Split the basis tabulation matrices oldtab = sf.matrix_sequence_quadrature_permuted[slice_direction] for i in range(vertical): seq = list(sf.matrix_sequence_quadrature_permuted) seq[slice_direction] = oldtab.copy(slice_size=vertical, slice_index=i) # Create new sf kernel with new interface kwargs = sf.interface.get_keyword_arguments() kwargs.update({'matrix_sequence': tuple(seq)}) newinterface = type(sf.interface)(**kwargs) kernels.append(sf.copy(matrix_sequence=tuple(seq), interface=newinterface)) # Join the new kernels into a sum factorization node buffer = get_counted_variable("joined_buffer") return {sf: VectorizedSumfactKernel(kernels=tuple(kernels), horizontal_width=horizontal, vertical_width=vertical, buffer=buffer, ) for sf in sumfacts}