[!262] Costmodel Validation

Merge branch 'feature/costmodel-validation' into 'master' The MR adds a new vectorization strategy target, which tries to find a vectorization strategy whose cost (according to the model strategy) is as close as possible to a given target. This allows to run several executables and plot the cost model value against performance data. The actual plotting work is done in dune-perftool-paperplots. The feature is very niche, and I tried to minimize the occurence of code paths only necessary for this, but I think the outcome is quite okay. See merge request [dominic/dune-perftool!262] [dominic/dune-perftool!262]: gitlab.dune-project.org/dominic/dune-perftool/merge_requests/262

[!262] Costmodel Validation
Merge branch 'feature/costmodel-validation' into 'master' The MR adds a new vectorization strategy target, which tries to find a vectorization strategy whose cost (according to the model strategy) is as close as possible to a given target. This allows to run several executables and plot the cost model value against performance data. The actual plotting work is done in dune-perftool-paperplots. The feature is very niche, and I tried to minimize the occurence of code paths only necessary for this, but I think the outcome is quite okay. See merge request [dominic/dune-perftool!262] [dominic/dune-perftool!262]: gitlab.dune-project.org/dominic/dune-perftool/merge_requests/262
c761cd04 · René Heß · 6b82c38c · e3ef02ad · c761cd04 · c761cd04
Commit c761cd04 authored 6 years ago by René Heß
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -82,7 +82,7 @@ class PerftoolFormOptionsArray(ImmutableRecord):
    sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
    sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
    vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target")
    vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
    vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
    vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
@@ -90,6 +90,7 @@ class PerftoolFormOptionsArray(ImmutableRecord):
    vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
    vectorization_list_index = PerftoolOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).")
    vectorization_jacobians = PerftoolOption(default=True, helpstr="Whether to attempt to vectorize jacobians (takes time, often not needed)")
+    vectorization_target = PerftoolOption(_type=float, helpstr="The cost function target for the 'target' cost model. Only needed to verify the cost model itself, do not use light-heartedly!!!")
    simplify = PerftoolOption(default=False, helpstr="Whether to simplify expressions using sympy")
    generate_jacobians = PerftoolOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.")
    generate_residuals = PerftoolOption(default=True, helpstr="Whether alpha_* methods should be generated.")

--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
 """ Sum factorization vectorization """
+from __future__ import division
 import logging
 from dune.perftool.loopy.target import dtype_floatingpoint
@@ -19,7 +21,7 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                              set_quadrature_points,
                                              )
 from dune.perftool.error import PerftoolVectorizationError
-from dune.perftool.options import get_form_option
+from dune.perftool.options import get_form_option, get_option, set_form_option
 from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff
 from pytools import product
@@ -51,21 +53,21 @@ def attach_vectorization_info(sf):
        return _cache_vectorization_info(sf, None)
-@backend(interface="vectorization_strategy", name="model")
 def costmodel(sf):
-    # Penalize vertical vectorization
+    # Penalize vertical vectorization and scalar execution
-    vertical_penalty = 1 + math.log(sf.vertical_width)
+    verticality = sf.vertical_width
-    # Penalize scalar sum factorization kernels
-    scalar_penalty = 1
    if isinstance(sf, SumfactKernel):
-        scalar_penalty = get_vcl_type_size(dtype_floatingpoint())
+        verticality = get_vcl_type_size(dtype_floatingpoint())
+    vertical_penalty = 1 + 0.5 * math.log(verticality, 2)
+    memory_penalty = 1.0
+    if isinstance(sf, VectorizedSumfactKernel):
+        memory_penalty = 1.0 + 0.25 * math.log(len(set(k.interface for k in sf.kernels)), 2)
    # Return total operations
-    return sf.operations * vertical_penalty * scalar_penalty
+    return sf.operations * vertical_penalty * memory_penalty
-@backend(interface="vectorization_strategy", name="explicit")
 def explicit_costfunction(sf):
    # Read the explicitly set values for horizontal and vertical vectorization
    width = get_vcl_type_size(dtype_floatingpoint())
@@ -85,10 +87,32 @@ def explicit_costfunction(sf):
        return 1000000000000
+def target_costfunction(sf):
+    # The cost of a kernel is given by the difference to the desired target cost.
+    # Pitfall: The target cost needs to be weighed to account for this being called
+    # on subsets and not on a full vectorization strategy!
+    _, all_sf, _ = filter_active_inactive_sumfacts()
+    total = len(all_sf)
+    target = float(get_form_option("vectorization_target"))
+    realcost = costmodel(sf)
+    ratio = sf.horizontal_width / total
+    return abs(realcost - ratio * target)
 def strategy_cost(strat_tuple):
    qp, strategy = strat_tuple
-    func = get_backend(interface="vectorization_strategy",
-                       selector=lambda: get_form_option("vectorization_strategy"))
+    # Choose the correct cost function
+    s = get_form_option("vectorization_strategy")
+    if s == "model":
+        func = costmodel
+    elif s == "explicit":
+        func = explicit_costfunction
+    elif s == "target":
+        func = target_costfunction
+    else:
+        raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s))
    keys = set(sf.cache_key for sf in strategy.values())
    set_quadrature_points(qp)
@@ -133,13 +157,33 @@ def stringify_vectorization_strategy(strategy):
    return result
-def decide_vectorization_strategy():
+def short_stringify_vectorization_strategy(strategy):
-    """ Decide how to vectorize!
+    """ A short string decribing the vectorization strategy. This is used
-    Note that the vectorization of the quadrature loop is independent of this,
+    in costmodel validation plots to describe what a data point does
-    as it is implemented through a post-processing (== loopy transformation) step.
    """
-    logger = logging.getLogger(__name__)
+    qp, strategy = strategy
+    def _short(k):
+        if isinstance(k, VectorizedSumfactKernel):
+            return str(k.horizontal_width)
+        else:
+            return "scalar"
+    stage1 = []
+    stage3 = []
+    keys = set(sf.cache_key for sf in strategy.values())
+    for kernel in strategy.values():
+        if kernel.cache_key in keys:
+            keys.discard(kernel.cache_key)
+            if kernel.stage == 1:
+                stage1.append(_short(kernel))
+            if kernel.stage == 3:
+                stage3.append(_short(kernel))
+    return "m0={};S1:{};S3:{}".format(qp[0], "|".join(stage1), "|".join(stage3))
+def filter_active_inactive_sumfacts():
    # Retrieve all sum factorization kernels for stage 1 and 3
    from dune.perftool.generation import retrieve_cache_items
    all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
@@ -153,6 +197,18 @@ def decide_vectorization_strategy():
    # All sum factorization kernels that get used
    active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
+    return all_sumfacts, active_sumfacts, inactive_sumfacts
+def decide_vectorization_strategy():
+    """ Decide how to vectorize!
+    Note that the vectorization of the quadrature loop is independent of this,
+    as it is implemented through a post-processing (== loopy transformation) step.
+    """
+    logger = logging.getLogger(__name__)
+    all_sumfacts, active_sumfacts, inactive_sumfacts = filter_active_inactive_sumfacts()
    # If no vectorization is needed, abort now
    if get_form_option("vectorization_strategy") == "none" or (get_global_context_value("form_type") == "jacobian" and not get_form_option("vectorization_jacobians")):
        for sf in all_sumfacts:
@@ -207,7 +263,41 @@ def level1_optimal_vectorization_strategy(sumfacts, width):
    # Find the minimum cost strategy between all the quadrature point tuples
    optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points}
-    qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
+    # If we are using the 'target' strategy, we might want to log some information.
+    if get_form_option("vectorization_strategy") == "target":
+        # Print the achieved cost and the target cost on the screen
+        set_form_option("vectorization_strategy", "model")
+        target = float(get_form_option("vectorization_target"))
+        qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target))
+        cost = strategy_cost((qp, optimal_strategies[qp]))
+        print("The target cost was:   {}".format(target))
+        print("The achieved cost was: {}".format(cost))
+        optimum = level1_optimal_vectorization_strategy(sumfacts, width)
+        print("The optimal cost would be: {}".format(strategy_cost(optimum)))
+        set_form_option("vectorization_strategy", "target")
+        print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp]))))
+        # Print the employed vectorization strategy into a file
+        suffix = ""
+        if get_global_context_value("integral_type") == "interior_facet":
+            suffix = "_dir{}_mod{}".format(get_global_context_value("facedir_s"),
+                                           get_global_context_value("facemod_s"))
+        filename = "targetstrat_{}{}.log".format(int(float(get_form_option("vectorization_target"))), suffix)
+        with open(filename, 'w') as f:
+            f.write("\n".join(stringify_vectorization_strategy((qp, optimal_strategies[qp]))))
+        # Write an entry into a csvfile which connects the given measuring identifier with a cost
+        from dune.testtools.parametertree.parser import parse_ini_file
+        inifile = parse_ini_file(get_option("ini_file"))
+        identifier = inifile["identifier"]
+        # TODO: Depending on the number of samples, we might need a file lock here.
+        with open("mapping.csv", 'a') as f:
+            f.write(" ".join((identifier, str(cost), short_stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + "\n")
+    else:
+        qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
    return qp, optimal_strategies[qp]
@@ -221,6 +311,8 @@ def level2_optimal_vectorization_strategy(sumfacts, width, qp):
    for key in keys:
        key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key)
+        # Minimize over all the opportunities for the subset given by the current key
        key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp),
                           key=fixedqp_strategy_costfunction(qp))
        sfdict = add_to_frozendict(sfdict, key_strategy)
@@ -286,7 +378,7 @@ def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, alread
 def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
    # Discard opportunities that do not contain enough horizontal kernels
-    if len(sumfacts) not in (horizontal, horizontal - 1):
+    if len(sumfacts) not in (horizontal, horizontal * vertical - 1):
        return None
    # Enhance the list of sumfact nodes by adding vertical splittings