diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 690a1c0ba912913f068fe2f295dcbc4c498bdf1e..95286389498b4697cc2450b464b37761f521f358 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -172,59 +172,55 @@ def decide_vectorization_strategy(): logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes" .format(len(active_sumfacts))) - # Find the best vectorization strategy by using a costmodel - width = get_vcl_type_size(dtype_floatingpoint()) - # - # Optimize over all the possible quadrature point tuples + # Find the best vectorization strategy by using a costmodel # - quad_points = [quadrature_points_per_direction()] - if get_form_option("vectorization_allow_quadrature_changes"): - sf = next(iter(active_sumfacts)) - depth = 1 - while depth <= width: - i = 0 if sf.matrix_sequence[0].face is None else 1 - quad = list(quadrature_points_per_direction()) - quad[i] = round_to_multiple(quad[i], depth) - quad_points.append(tuple(quad)) - depth = depth * 2 - quad_points = list(set(quad_points)) - - if get_form_option("vectorization_strategy") == "fromlist": - # This is a bit special and does not follow the minimization procedure at all - - def _choose_strategy_from_list(stage1_sumfacts): - strategy = 0 - for qp in quad_points: - for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp): - if strategy == int(get_form_option("vectorization_list_index")): - # Output the strategy and its cost into a separate file - if get_global_context_value("form_type") == "jacobian_apply": - with open("strategycosts.csv", "a") as f: - f.write("{} {}\n".format(strategy, strategy_cost((qp, strat)))) - return qp, strat - strategy = strategy + 1 - - raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_form_option("vectorization_list_index"))) - - s1_sumfacts = 
frozenset(sf for sf in active_sumfacts if sf.stage == 1) - - total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points) - print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_form_option("vectorization_list_index")), - total)) - qp, sfdict = _choose_strategy_from_list(s1_sumfacts) - - keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1) - for key in keys: - key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key) - minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp), - key=fixedqp_strategy_costfunction(qp)) - sfdict = add_to_frozendict(sfdict, minimum) - else: - # Find the minimum cost strategy between all the quadrature point tuples - optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points} - qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) - sfdict = optimal_strategies[qp] + # Note that this optimization procedure uses a hierarchic approach to bypass + # the problems of unfavorable complexity of the set of all possible vectorization + # opportunities. 
Optimizations are performed at different levels (you find these + # levels in the function names implementing them), where optimal solutions at a + # higher level are combined into lower level solutions or optima of optimal solutions + # at higher level are calculated: + # * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima) + # * Level 2: Split by parallelizability and combine optima into optimal solution + # * Level 3: Optimize number of different inputs to consider + # * Level 4: Optimize horizontal/vertical/hybrid strategy + width = get_vcl_type_size(dtype_floatingpoint()) + qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width) + + +# TODO: Check how the 'fromlist' generator fits into the new overall picture +# +# if get_form_option("vectorization_strategy") == "fromlist": +# # This is a bit special and does not follow the minimization procedure at all +# +# def _choose_strategy_from_list(stage1_sumfacts): +# strategy = 0 +# for qp in quad_points: +# for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp): +# if strategy == int(get_form_option("vectorization_list_index")): +# # Output the strategy and its cost into a separate file +# if get_global_context_value("form_type") == "jacobian_apply": +# with open("strategycosts.csv", "a") as f: +# f.write("{} {}\n".format(strategy, strategy_cost((qp, strat)))) +# return qp, strat +# strategy = strategy + 1 +# +# raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_form_option("vectorization_list_index"))) +# +# s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1) +# +# total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points) +# print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_form_option("vectorization_list_index")), +# total)) +# qp, 
def level1_optimal_vectorization_strategy(sumfacts, width):
    """Level 1: choose the quadrature point tuple with the cheapest strategy.

    The candidates are the currently configured quadrature point tuple and,
    if the form option ``vectorization_allow_quadrature_changes`` is set,
    variants of it where a single direction is passed through
    ``round_to_multiple`` for every power of two not exceeding ``width``.
    For each candidate the level 2 optimum is computed, and the candidate
    whose ``(qp, strategy)`` pair has the lowest ``strategy_cost`` wins.

    :arg sumfacts: The sum factorization kernels to find a strategy for.
        Must be non-empty if quadrature changes are allowed, as the first
        element is inspected to choose the direction to modify.
    :arg width: Upper bound for the multiples tried on the quadrature point
        count (the caller passes the vector type size here).
    :returns: A tuple ``(qp, strategy)`` of the chosen quadrature point
        tuple and the level 2 optimal strategy found for it.
    """
    # Gather the candidate quadrature point tuples, starting with the
    # currently configured one.
    base_quad = quadrature_points_per_direction()
    candidates = [base_quad]

    if get_form_option("vectorization_allow_quadrature_changes"):
        sf = next(iter(sumfacts))
        # The modified direction does not depend on the depth: it is
        # direction 0, unless the first matrix of the inspected kernel has
        # a face set, in which case it is direction 1.
        direction = 0 if sf.matrix_sequence[0].face is None else 1

        depth = 1
        while depth <= width:
            quad = list(base_quad)
            quad[direction] = round_to_multiple(quad[direction], depth)
            candidates.append(tuple(quad))
            depth *= 2

        # Different depths may result in the same tuple: drop duplicates
        candidates = list(set(candidates))

    # Find the minimum cost strategy between all the quadrature point tuples
    optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp)
                          for qp in candidates}

    def _cost(qp):
        return strategy_cost((qp, optimal_strategies[qp]))

    best_qp = min(optimal_strategies, key=_cost)

    return best_qp, optimal_strategies[best_qp]