diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index a28b2bc5beb7b7cb6ac5317797a0122be10bd214..9d4ccb3f5da38ac1d83bd5d7b63f7c7a1059c99d 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -82,7 +82,7 @@ class PerftoolFormOptionsArray(ImmutableRecord): sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization") sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)") vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model") + vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target") vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize") vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy") vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy") @@ -90,6 +90,7 @@ class PerftoolFormOptionsArray(ImmutableRecord): vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers") vectorization_list_index = PerftoolOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).") vectorization_jacobians = PerftoolOption(default=True, helpstr="Whether to attempt to vectorize jacobians (takes time, often not needed)") + vectorization_target = PerftoolOption(_type=float, helpstr="The cost function target for the 'target' cost model. Only needed to verify the cost model itself, do not use light-heartedly!!!") simplify = PerftoolOption(default=False, helpstr="Whether to simplify expressions using sympy") generate_jacobians = PerftoolOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.") generate_residuals = PerftoolOption(default=True, helpstr="Whether alpha_* methods should be generated.") diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 3e79419dcb97777c86a130deb3f0b82b84d4c950..aed3f4a30b9115fc0a4aef1169f85db8e492853a 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -1,5 +1,7 @@ """ Sum factorization vectorization """ +from __future__ import division + import logging from dune.perftool.loopy.target import dtype_floatingpoint @@ -19,7 +21,7 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray, set_quadrature_points, ) from dune.perftool.error import PerftoolVectorizationError -from dune.perftool.options import get_form_option +from dune.perftool.options import get_form_option, get_option, set_form_option from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff from pytools import product @@ -51,21 +53,21 @@ def attach_vectorization_info(sf): return _cache_vectorization_info(sf, None) -@backend(interface="vectorization_strategy", name="model") def costmodel(sf): - # Penalize vertical vectorization - vertical_penalty = 1 + math.log(sf.vertical_width) - - # Penalize scalar sum factorization kernels - scalar_penalty = 1 + # Penalize vertical vectorization and scalar execution + verticality = sf.vertical_width if isinstance(sf, SumfactKernel): - scalar_penalty = get_vcl_type_size(dtype_floatingpoint()) + verticality = get_vcl_type_size(dtype_floatingpoint()) + vertical_penalty = 1 + 0.5 * math.log(verticality, 2) + + memory_penalty = 1.0 + if isinstance(sf, VectorizedSumfactKernel): + memory_penalty = 1.0 + 0.25 * math.log(len(set(k.interface for k in sf.kernels)), 2) # Return total operations - return sf.operations * vertical_penalty * scalar_penalty + return sf.operations * vertical_penalty * memory_penalty -@backend(interface="vectorization_strategy", name="explicit") def explicit_costfunction(sf): # Read the explicitly set values for horizontal and vertical vectorization width = get_vcl_type_size(dtype_floatingpoint()) @@ -85,10 +87,32 @@ def explicit_costfunction(sf): return 1000000000000 +def target_costfunction(sf): + # The cost of a kernel is given by the difference to the desired target cost. + # Pitfall: The target cost needs to be weighed to account for this being called + # on subsets and not on a full vectorization strategy! + _, all_sf, _ = filter_active_inactive_sumfacts() + total = len(all_sf) + target = float(get_form_option("vectorization_target")) + realcost = costmodel(sf) + ratio = sf.horizontal_width / total + return abs(realcost - ratio * target) + + def strategy_cost(strat_tuple): qp, strategy = strat_tuple - func = get_backend(interface="vectorization_strategy", - selector=lambda: get_form_option("vectorization_strategy")) + + # Choose the correct cost function + s = get_form_option("vectorization_strategy") + if s == "model": + func = costmodel + elif s == "explicit": + func = explicit_costfunction + elif s == "target": + func = target_costfunction + else: + raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s)) + keys = set(sf.cache_key for sf in strategy.values()) set_quadrature_points(qp) @@ -133,13 +157,33 @@ def stringify_vectorization_strategy(strategy): return result -def decide_vectorization_strategy(): - """ Decide how to vectorize! - Note that the vectorization of the quadrature loop is independent of this, - as it is implemented through a post-processing (== loopy transformation) step. +def short_stringify_vectorization_strategy(strategy): + """ A short string decribing the vectorization strategy. This is used + in costmodel validation plots to describe what a data point does """ - logger = logging.getLogger(__name__) + qp, strategy = strategy + + def _short(k): + if isinstance(k, VectorizedSumfactKernel): + return str(k.horizontal_width) + else: + return "scalar" + + stage1 = [] + stage3 = [] + keys = set(sf.cache_key for sf in strategy.values()) + for kernel in strategy.values(): + if kernel.cache_key in keys: + keys.discard(kernel.cache_key) + if kernel.stage == 1: + stage1.append(_short(kernel)) + if kernel.stage == 3: + stage3.append(_short(kernel)) + + return "m0={};S1:{};S3:{}".format(qp[0], "|".join(stage1), "|".join(stage3)) + +def filter_active_inactive_sumfacts(): # Retrieve all sum factorization kernels for stage 1 and 3 from dune.perftool.generation import retrieve_cache_items all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")] @@ -153,6 +197,18 @@ def decide_vectorization_strategy(): # All sum factorization kernels that get used active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts] + return all_sumfacts, active_sumfacts, inactive_sumfacts + + +def decide_vectorization_strategy(): + """ Decide how to vectorize! + Note that the vectorization of the quadrature loop is independent of this, + as it is implemented through a post-processing (== loopy transformation) step. + """ + logger = logging.getLogger(__name__) + + all_sumfacts, active_sumfacts, inactive_sumfacts = filter_active_inactive_sumfacts() + # If no vectorization is needed, abort now if get_form_option("vectorization_strategy") == "none" or (get_global_context_value("form_type") == "jacobian" and not get_form_option("vectorization_jacobians")): for sf in all_sumfacts: @@ -207,7 +263,41 @@ def level1_optimal_vectorization_strategy(sumfacts, width): # Find the minimum cost strategy between all the quadrature point tuples optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points} - qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) + + # If we are using the 'target' strategy, we might want to log some information. + if get_form_option("vectorization_strategy") == "target": + # Print the achieved cost and the target cost on the screen + set_form_option("vectorization_strategy", "model") + target = float(get_form_option("vectorization_target")) + qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target)) + cost = strategy_cost((qp, optimal_strategies[qp])) + + print("The target cost was: {}".format(target)) + print("The achieved cost was: {}".format(cost)) + optimum = level1_optimal_vectorization_strategy(sumfacts, width) + print("The optimal cost would be: {}".format(strategy_cost(optimum))) + set_form_option("vectorization_strategy", "target") + print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp])))) + + # Print the employed vectorization strategy into a file + suffix = "" + if get_global_context_value("integral_type") == "interior_facet": + suffix = "_dir{}_mod{}".format(get_global_context_value("facedir_s"), + get_global_context_value("facemod_s")) + filename = "targetstrat_{}{}.log".format(int(float(get_form_option("vectorization_target"))), suffix) + with open(filename, 'w') as f: + f.write("\n".join(stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + + # Write an entry into a csvfile which connects the given measuring identifier with a cost + from dune.testtools.parametertree.parser import parse_ini_file + inifile = parse_ini_file(get_option("ini_file")) + identifier = inifile["identifier"] + + # TODO: Depending on the number of samples, we might need a file lock here. + with open("mapping.csv", 'a') as f: + f.write(" ".join((identifier, str(cost), short_stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + "\n") + else: + qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) return qp, optimal_strategies[qp] @@ -221,6 +311,8 @@ def level2_optimal_vectorization_strategy(sumfacts, width, qp): for key in keys: key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key) + + # Minimize over all the opportunities for the subset given by the current key key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp), key=fixedqp_strategy_costfunction(qp)) sfdict = add_to_frozendict(sfdict, key_strategy) @@ -286,7 +378,7 @@ def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, alread def get_vectorization_dict(sumfacts, vertical, horizontal, qp): # Discard opportunities that do not contain enough horizontal kernels - if len(sumfacts) not in (horizontal, horizontal - 1): + if len(sumfacts) not in (horizontal, horizontal * vertical - 1): return None # Enhance the list of sumfact nodes by adding vertical splittings