diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 0c626519dd5ab7c4631d01749440bcb2efa3ca64..14af6638c9a29669c9a28556d0d689bc75a3d49e 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -53,7 +53,7 @@ def attach_vectorization_info(sf): def position_penalty_factor(sf): - if isinstance(sf, SumfactKernel): + if isinstance(sf, SumfactKernel) or sf.vertical_width > 1: return 1 else: return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length)) @@ -64,8 +64,13 @@ def costmodel(sf): # Penalize vertical vectorization vertical_penalty = 1 + math.log(sf.vertical_width) + # Penalize scalar sum factorization kernels + scalar_penalty = 1 + if isinstance(sf, SumfactKernel): + scalar_penalty = get_vcl_type_size(np.float64) + # Return total operations - return sf.operations * position_penalty_factor(sf) * vertical_penalty + return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty @backend(interface="vectorization_strategy", name="explicit") @@ -89,8 +94,6 @@ def explicit_costfunction(sf): def strategy_cost(strategy): - qp, strategy = strategy - set_quadrature_points(qp) func = get_backend(interface="vectorization_strategy", selector=lambda: get_option("vectorization_strategy")) keys = set(sf.cache_key for sf in strategy.values()) @@ -160,35 +163,14 @@ def decide_vectorization_strategy(): # Find the best vectorization strategy by using a costmodel width = get_vcl_type_size(np.float64) - strategy = min(vectorization_opportunity_generator(active_sumfacts, width), - key=strategy_cost) - # Treat the quadrature points - qp, sfdict = strategy - set_quadrature_points(qp) - - logger.debug("decide_vectorization_strategy: Decided for the following strategy:" - "\n".join(stringify_vectorization_strategy(strategy))) - - # We map inactive sum factorization kernels to 0 - sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts}) - - # Register the results - for sf in all_sumfacts: - _cache_vectorization_info(sf, sfdict[sf]) - - -def vectorization_opportunity_generator(sumfacts, width): - """ Generator that yields all vectorization opportunities for the given - sum factorization kernels as tuples of quadrature point tuple and vectorization - dictionary """ # - # Find all the possible quadrature point tuples + # Optimize over all the possible quadrature point tuples # quad_points = [quadrature_points_per_direction()] if get_option("vectorization_allow_quadrature_changes"): - sf = next(iter(sumfacts)) + sf = next(iter(active_sumfacts)) depth = 1 while depth <= width: i = 0 if sf.matrix_sequence[0].face is None else 1 @@ -198,12 +180,44 @@ def vectorization_opportunity_generator(sumfacts, width): depth = depth * 2 quad_points = list(set(quad_points)) - for qp in quad_points: - # - # Determine vectorization opportunities given a fixed quadrature point number - # - for opp in fixed_quad_vectorization_opportunity_generator(frozenset(sumfacts), width, qp): - yield qp, opp + # Find the minimum cost strategy between all the quadrature point tuples + optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points} + qp = min(optimal_strategies, key=lambda qp: strategy_cost(optimal_strategies[qp])) + sfdict = optimal_strategies[qp] + set_quadrature_points(qp) + + logger.debug("decide_vectorization_strategy: Decided for the following strategy:" + "\n".join(stringify_vectorization_strategy((qp, sfdict)))) + + # We map inactive sum factorization kernels to 0 + sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts}) + + # Register the results + for sf in all_sumfacts: + _cache_vectorization_info(sf, sfdict[sf]) + + +def fixed_quadrature_optimal_vectorization(sumfacts, width, qp): + """ For a given quadrature point tuple, find the optimal strategy! + + In order to have this scale sufficiently, we cannot simply list all vectorization + opportunities and score them individually, but we need to do a divide and conquer + approach. + """ + set_quadrature_points(qp) + + # Find the sets of simultaneously realizable kernels (thats an equivalence relation) + keys = frozenset(sf.input_key for sf in sumfacts) + + # Find minimums for each of these sets + sfdict = frozendict() + for key in keys: + key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key) + minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp), + key=strategy_cost) + sfdict = add_to_frozendict(sfdict, minimum) + + return sfdict def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()): @@ -227,20 +241,14 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already= ): yield opp - # Find all the sum factorization kernels that the chosen kernel can be parallelized with - # - # TODO: Right now we check for same input, which is not actually needed in order - # to be a suitable candidate! We should relax this concept at some point! - candidates = filter(lambda sf: sf.input_key == sf_to_decide.input_key, sumfacts) - horizontal = 1 while horizontal <= width: # Iterate over the possible combinations of sum factorization kernels # taking into account all the permutations of kernels. This also includes # combinations which use a padding of 1 - but only for pure horizontality. - generators = [it.permutations(candidates, horizontal)] + generators = [it.permutations(sumfacts, horizontal)] if horizontal >= 4: - generators.append(it.permutations(candidates, horizontal - 1)) + generators.append(it.permutations(sumfacts, horizontal - 1)) for combo in it.chain(*generators): # The chosen kernels must be part of the kernels for recursion # to work correctly