Implement everything through a costmodel and adapt the ini options

94e8e7df · Dominic Kempf · 7c27277d · 94e8e7df · 94e8e7df · 94e8e7df
Commit 94e8e7df authored 7 years ago by Dominic Kempf
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -54,15 +54,12 @@ class PerftoolOptionsArray(ImmutableRecord):
    project_basedir = PerftoolOption(helpstr="The base (build) directory of the dune-perftool project")
    fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
    sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
-    vectorize_quad = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model")
-    vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
-    vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
-    vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)")
+    vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
-    vectorize_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization")
+    vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
-    vectorize_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization")
-    vectorize_padding = PerftoolOption(default=None, helpstr="an explicit value for padding in vectorization")
-    vectorize_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
    turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
    architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
    grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")

--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -549,7 +549,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
    kernel = heuristic_duplication(kernel)
    # Maybe apply vectorization strategies
-    if get_option("vectorize_quad"):
+    if get_option("vectorization_quadloop"):
        if get_option("sumfact"):
            from dune.perftool.loopy.transformations.vectorize_quad import vectorize_quadrature_loop
            kernel = vectorize_quadrature_loop(kernel)

--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -4,7 +4,9 @@ import logging
 from dune.perftool.loopy.vcl import get_vcl_type_size
 from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
-from dune.perftool.generation import (generator_factory,
+from dune.perftool.generation import (backend,
+                                      generator_factory,
+                                      get_backend,
                                      get_counted_variable,
                                      get_global_context_value,
                                      )
@@ -48,165 +50,38 @@ def attach_vectorization_info(sf):
        return _cache_vectorization_info(sf, None)
-def no_vec(sf):
+@backend(interface="vectorization_costfunction", name="greedy")
-    return sf.copy(buffer=get_counted_variable("buffer"))
+def greedy_costfunction(sf):
+    return 1
-def no_vectorization(sumfacts):
+@backend(interface="vectorization_costfunction", name="explicit")
-    return {sf: no_vec(sf) for sf in sumfacts}
+def explicit_costfunction(sf):
+    # Read the explicitly set values for horizontal and vertical vectorization
+    width = get_vcl_type_size(np.float64)
+    horizontal = int(get_option("vectorization_horizontal", width))
+    vertical = int(get_option("vectorization_vertical", 1))
+    if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
-def vertical_vectorization_strategy(sumfact, depth):
+        return 1
-    # If depth is 1, there is nothing do
-    if depth == 1:
-        if isinstance(sumfact, SumfactKernel):
-            return {sumfact: sumfact}
-        else:
-            return {k: sumfact for k in sumfact.kernels}
-    # Assert that this is not already sliced
-    assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
-    result = {}
-    # Determine which of the matrices in the kernel should be sliced
-    def determine_slice_direction(sf):
-        for i, mat in enumerate(sf.matrix_sequence):
-            if mat.quadrature_size % depth == 0:
-                return i
-            elif get_option("vectorize_allow_quadrature_changes") and mat.quadrature_size != 1:
-                quad = list(quadrature_points_per_direction())
-                quad[i] = round_to_multiple(quad[i], depth)
-                set_quadrature_points(tuple(quad))
-                return i
-            elif mat.quadrature_size != 1:
-                raise PerftoolError("Vertical vectorization is not possible!")
-    if isinstance(sumfact, SumfactKernel):
-        kernels = [sumfact]
    else:
-        assert isinstance(sumfact, VectorizedSumfactKernel)
+        return 2
-        kernels = sumfact.kernels
-    newkernels = []
-    for sf in kernels:
-        sliced = determine_slice_direction(sf)
-        oldtab = sf.matrix_sequence[sliced]
-        for i in range(depth):
-            seq = list(sf.matrix_sequence)
-            seq[sliced] = oldtab.copy(slice_size=depth,
-                                      slice_index=i)
-            newkernels.append(sf.copy(matrix_sequence=tuple(seq)))
-    if isinstance(sumfact, SumfactKernel):
-        buffer = get_counted_variable("vertical_buffer")
-        result[sumfact] = VectorizedSumfactKernel(kernels=tuple(newkernels),
-                                                  buffer=buffer,
-                                                  vertical_width=depth,
-                                                  )
-    else:
-        assert isinstance(sumfact, VectorizedSumfactKernel)
-        for sf in kernels:
-            result[sf] = sumfact.copy(kernels=tuple(newkernels),
-                                      vertical_width=depth,
-                                      )
-    return result
-def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1):
-    result = {}
-    todo = set(sumfacts)
-    while todo:
-        kernels = []
-        for sf in sorted(todo, key=lambda s: s.position_priority):
-            if len(kernels) < width:
-                kernels.append(sf)
-                todo.discard(sf)
-        buffer = get_counted_variable("joined_buffer")
-        if len(kernels) in range(width - allow_padding, width + 1):
-            for sf in kernels:
-                result[sf] = VectorizedSumfactKernel(kernels=tuple(kernels),
-                                                     horizontal_width=width,
-                                                     buffer=buffer,
-                                                     )
-    return result
-def diagonal_vectorization_strategy(sumfacts, width):
-    # Read explicitly set values
-    horizontal = get_option("vectorize_horizontal")
-    vertical = get_option("vectorize_vertical")
-    padding = get_option("vectorize_padding")
-    if width == 4:
-        if horizontal is None:
-            horizontal = 2
-        if vertical is None:
-            vertical = 2
-        if padding is None:
-            padding = 0
-    elif width == 8:
-        if horizontal is None:
-            horizontal = 4
-        if vertical is None:
-            vertical = 2
-        if padding is None:
-            padding = 1
-    else:
-        raise NotImplementedError
-    horizontal = int(horizontal)
-    vertical = int(vertical)
-    padding = int(padding)
-    result = {}
-    horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=padding)
-    for sf in horizontal_kernels:
-        vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width)
-        for k in vert:
-            result[k] = vert[k]
-    return result
+def strategy_cost(strategy):
-def greedy_vectorization_strategy(sumfacts, width):
+    qp, strategy = strategy
-    sumfacts = set(sumfacts)
+    set_quadrature_points(qp)
-    horizontal = width
+    func = get_backend(interface="vectorization_costfunction",
-    vertical = 1
+                       name=get_option("vectorization_strategy"))
-    allowed_padding = 1
+    return sum(float(func(sf)) for sf in strategy.values())
-    result = {}
-    while horizontal > 0:
-        if horizontal > 1:
-            horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=allowed_padding)
-        else:
-            horizontal_kernels = {sf: sf for sf in sumfacts}
-        for sf in horizontal_kernels:
-            if horizontal_kernels[sf].horizontal_width == horizontal:
-                vert = vertical_vectorization_strategy(horizontal_kernels[sf],
-                                                       vertical)
-                for k in vert:
-                    result[k] = vert[k]
-                sumfacts.discard(sf)
-        horizontal = horizontal // 2
-        vertical = vertical * 2
-        # We heuristically allow padding only on the full SIMD width
-        allowed_padding = 0
-    return result
-def print_vectorization_strategy(strategy):
+def stringify_vectorization_strategy(strategy):
+    result = []
    qp, strategy = strategy
-    print "\nPrinting potential vectorization strategy:"
-    print "Quadrature point tuple: {}".format(qp)
+    result.append("Printing potential vectorization strategy:")
+    result.append("Quadrature point tuple: {}".format(qp))
    # Look for all realizations in the strategy and iterate over them
    cache_keys = frozenset(v.cache_key for v in strategy.values())
@@ -214,12 +89,12 @@ def print_vectorization_strategy(strategy):
        # Filter all the kernels that are realized by this and print
        for key in strategy:
            if strategy[key].cache_key == ck:
-                print "{}:".format(key)
+                result.append("{}:".format(key))
        # Find one representative to print
        for val in strategy.values():
            if val.cache_key == ck:
-                print "    {}".format(val)
+                result.append("    {}".format(val))
                break
@@ -243,45 +118,29 @@ def decide_vectorization_strategy():
    # All sum factorization kernels that get used
    active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
-    # We map inacitve sum factorizatino kernels to 0
+    # If no vectorization is needed, abort now
-    sfdict = {}
+    if get_option("vectorization_strategy") == "none":
-    for sf in inactive_sumfacts:
+        for sf in all_sumfacts:
-        sfdict[sf] = 0
+            _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
+        return
    logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
                 .format(len(active_sumfacts)))
-    if get_option("vectorize_grads"):
+    # Find the best vectorization strategy by using a costmodel
-        # Currently we base our idea here on the fact that we only group sum
+    width = get_vcl_type_size(np.float64)
-        # factorization kernels with the same input.
+    strategy = min(vectorization_opportunity_generator(active_sumfacts, width),
-        inputkeys = set(sf.input_key for sf in active_sumfacts)
+                   key=strategy_cost)
-        for inputkey in inputkeys:
-            width = get_vcl_type_size(np.float64)
+    # Treat the quadrature points
-            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
+    qp, sfdict = strategy
-            for old, new in horizontal_vectorization_strategy(sumfact_filter, width).items():
+    set_quadrature_points(qp)
-                sfdict[old] = new
-    elif get_option("vectorize_slice"):
+    logger.debug("decide_vectorization_strategy: Decided for the following strategy:\n%s",
-        for sumfact in active_sumfacts:
+                 "\n".join(stringify_vectorization_strategy(strategy)))
-            width = get_vcl_type_size(np.float64)
-            for old, new in vertical_vectorization_strategy(sumfact, width).items():
+    # We map inactive sum factorization kernels to 0
-                sfdict[old] = new
+    sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
-    elif get_option("vectorize_diagonal"):
-        inputkeys = set(sf.input_key for sf in active_sumfacts)
-        for inputkey in inputkeys:
-            width = get_vcl_type_size(np.float64)
-            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
-            for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
-                sfdict[old] = new
-    elif get_option("vectorize_greedy"):
-        inputkeys = set(sf.input_key for sf in active_sumfacts)
-        for inputkey in inputkeys:
-            width = get_vcl_type_size(np.float64)
-            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
-            for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
-                sfdict[old] = new
-    else:
-        for old, new in no_vectorization(active_sumfacts).items():
-            sfdict[old] = new
    # Register the results
    for sf in all_sumfacts:
@@ -297,7 +156,7 @@ def vectorization_opportunity_generator(sumfacts, width):
    #
    quad_points = [quadrature_points_per_direction()]
-    if True or get_option("vectorize_allow_quadrature_changes"):
+    if True or get_option("vectorization_allow_quadrature_changes"):
        sf = next(iter(sumfacts))
        depth = 1
        while depth <= width:
@@ -331,7 +190,9 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
    for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
                                                              width,
                                                              qp,
-                                                              add_to_frozendict(already, {sf_to_decide: sf_to_decide}),
+                                                              add_to_frozendict(already,
+                                                                                {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
+                                                                                ),
                                                              ):
        yield opp