diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 90ab3a11178b77984ede80913f517cea13df574c..9a9cca4ca53c5a50e3e40226116a620c064f2c16 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -54,15 +54,12 @@ class PerftoolOptionsArray(ImmutableRecord): project_basedir = PerftoolOption(helpstr="The base (build) directory of the dune-perftool project") fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.") sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization") - vectorize_quad = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)") - vectorize_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization") - vectorize_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization") - vectorize_padding = PerftoolOption(default=None, helpstr="an explicit value for padding in vectorization") - vectorize_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers") + vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") + vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model") + vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy") + vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy") + vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization") + vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers") turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.") architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl") grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.") diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 7a9f4169c837f019d5af2842875c3ef39d2eb133..900ce61711e67310472d143ebea5930f52ba5756 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -549,7 +549,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): kernel = heuristic_duplication(kernel) # Maybe apply vectorization strategies - if get_option("vectorize_quad"): + if get_option("vectorization_quadloop"): if get_option("sumfact"): from dune.perftool.loopy.transformations.vectorize_quad import vectorize_quadrature_loop kernel = vectorize_quadrature_loop(kernel) diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 53a5c278f1d361abc415bf4186b856ce8c01a7d6..e3edaf2cb70d1a06681742445bc83314b42c306b 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -4,7 +4,9 @@ import logging from dune.perftool.loopy.vcl import get_vcl_type_size from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel -from dune.perftool.generation import (generator_factory, +from dune.perftool.generation import (backend, + generator_factory, + get_backend, get_counted_variable, get_global_context_value, ) @@ -48,165 +50,38 @@ def attach_vectorization_info(sf): return _cache_vectorization_info(sf, None) -def no_vec(sf): - return sf.copy(buffer=get_counted_variable("buffer")) +@backend(interface="vectorization_costfunction", name="greedy") +def greedy_costfunction(sf): + return 1 -def no_vectorization(sumfacts): - return {sf: no_vec(sf) for sf in sumfacts} +@backend(interface="vectorization_costfunction", name="explicit") +def explicit_costfunction(sf): + # Read the explicitly set values for horizontal and vertical vectorization + width = get_vcl_type_size(np.float64) + horizontal = int(get_option("vectorization_horizontal", width)) + vertical = int(get_option("vectorization_vertical", 1)) - -def vertical_vectorization_strategy(sumfact, depth): - # If depth is 1, there is nothing do - if depth == 1: - if isinstance(sumfact, SumfactKernel): - return {sumfact: sumfact} - else: - return {k: sumfact for k in sumfact.kernels} - - # Assert that this is not already sliced - assert all(mat.slice_size is None for mat in sumfact.matrix_sequence) - result = {} - - # Determine which of the matrices in the kernel should be sliced - def determine_slice_direction(sf): - for i, mat in enumerate(sf.matrix_sequence): - if mat.quadrature_size % depth == 0: - return i - elif get_option("vectorize_allow_quadrature_changes") and mat.quadrature_size != 1: - quad = list(quadrature_points_per_direction()) - quad[i] = round_to_multiple(quad[i], depth) - set_quadrature_points(tuple(quad)) - return i - elif mat.quadrature_size != 1: - raise PerftoolError("Vertical vectorization is not possible!") - - if isinstance(sumfact, SumfactKernel): - kernels = [sumfact] + if sf.horizontal_width == horizontal and sf.vertical_width == vertical: + return 1 else: - assert isinstance(sumfact, VectorizedSumfactKernel) - kernels = sumfact.kernels - - newkernels = [] - for sf in kernels: - sliced = determine_slice_direction(sf) - oldtab = sf.matrix_sequence[sliced] - for i in range(depth): - seq = list(sf.matrix_sequence) - seq[sliced] = oldtab.copy(slice_size=depth, - slice_index=i) - newkernels.append(sf.copy(matrix_sequence=tuple(seq))) - - if isinstance(sumfact, SumfactKernel): - buffer = get_counted_variable("vertical_buffer") - result[sumfact] = VectorizedSumfactKernel(kernels=tuple(newkernels), - buffer=buffer, - vertical_width=depth, - ) - else: - assert isinstance(sumfact, VectorizedSumfactKernel) - for sf in kernels: - result[sf] = sumfact.copy(kernels=tuple(newkernels), - vertical_width=depth, - ) - - return result - - -def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1): - result = {} - todo = set(sumfacts) - while todo: - kernels = [] - for sf in sorted(todo, key=lambda s: s.position_priority): - if len(kernels) < width: - kernels.append(sf) - todo.discard(sf) - - buffer = get_counted_variable("joined_buffer") - if len(kernels) in range(width - allow_padding, width + 1): - for sf in kernels: - result[sf] = VectorizedSumfactKernel(kernels=tuple(kernels), - horizontal_width=width, - buffer=buffer, - ) - - return result - - -def diagonal_vectorization_strategy(sumfacts, width): - # Read explicitly set values - horizontal = get_option("vectorize_horizontal") - vertical = get_option("vectorize_vertical") - padding = get_option("vectorize_padding") - - if width == 4: - if horizontal is None: - horizontal = 2 - if vertical is None: - vertical = 2 - if padding is None: - padding = 0 - elif width == 8: - if horizontal is None: - horizontal = 4 - if vertical is None: - vertical = 2 - if padding is None: - padding = 1 - else: - raise NotImplementedError - - horizontal = int(horizontal) - vertical = int(vertical) - padding = int(padding) - - result = {} - - horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=padding) - - for sf in horizontal_kernels: - vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width) - for k in vert: - result[k] = vert[k] + return 2 - return result - -def greedy_vectorization_strategy(sumfacts, width): - sumfacts = set(sumfacts) - horizontal = width - vertical = 1 - allowed_padding = 1 - - result = {} - - while horizontal > 0: - if horizontal > 1: - horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=allowed_padding) - else: - horizontal_kernels = {sf: sf for sf in sumfacts} - - for sf in horizontal_kernels: - if horizontal_kernels[sf].horizontal_width == horizontal: - vert = vertical_vectorization_strategy(horizontal_kernels[sf], - vertical) - for k in vert: - result[k] = vert[k] - sumfacts.discard(sf) - - horizontal = horizontal // 2 - vertical = vertical * 2 - - # We heuristically allow padding only on the full SIMD width - allowed_padding = 0 - return result +def strategy_cost(strategy): + qp, strategy = strategy + set_quadrature_points(qp) + func = get_backend(interface="vectorization_costfunction", + name=get_option("vectorization_strategy")) + return sum(float(func(sf)) for sf in strategy.values()) -def print_vectorization_strategy(strategy): +def stringify_vectorization_strategy(strategy): + result = [] qp, strategy = strategy - print "\nPrinting potential vectorization strategy:" - print "Quadrature point tuple: {}".format(qp) + + result.append["Printing potential vectorization strategy:"] + result.append["Quadrature point tuple: {}".format(qp)] # Look for all realizations in the strategy and iterate over them cache_keys = frozenset(v.cache_key for v in strategy.values()) @@ -214,12 +89,12 @@ def print_vectorization_strategy(strategy): # Filter all the kernels that are realized by this and print for key in strategy: if strategy[key].cache_key == ck: - print "{}:".format(key) + result.append["{}:".format(key)] # Find one representative to print for val in strategy.values(): if val.cache_key == ck: - print " {}".format(val) + result.append[" {}".format(val)] break @@ -243,45 +118,29 @@ def decide_vectorization_strategy(): # All sum factorization kernels that get used active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts] - # We map inacitve sum factorizatino kernels to 0 - sfdict = {} - for sf in inactive_sumfacts: - sfdict[sf] = 0 + # If no vectorization is needed, abort now + if get_option("vectorization_strategy") == "none": + for sf in all_sumfacts: + _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer"))) + return logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes" .format(len(active_sumfacts))) - if get_option("vectorize_grads"): - # Currently we base our idea here on the fact that we only group sum - # factorization kernels with the same input. - inputkeys = set(sf.input_key for sf in active_sumfacts) - for inputkey in inputkeys: - width = get_vcl_type_size(np.float64) - sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] - for old, new in horizontal_vectorization_strategy(sumfact_filter, width).items(): - sfdict[old] = new - elif get_option("vectorize_slice"): - for sumfact in active_sumfacts: - width = get_vcl_type_size(np.float64) - for old, new in vertical_vectorization_strategy(sumfact, width).items(): - sfdict[old] = new - elif get_option("vectorize_diagonal"): - inputkeys = set(sf.input_key for sf in active_sumfacts) - for inputkey in inputkeys: - width = get_vcl_type_size(np.float64) - sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] - for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items(): - sfdict[old] = new - elif get_option("vectorize_greedy"): - inputkeys = set(sf.input_key for sf in active_sumfacts) - for inputkey in inputkeys: - width = get_vcl_type_size(np.float64) - sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] - for old, new in greedy_vectorization_strategy(sumfact_filter, width).items(): - sfdict[old] = new - else: - for old, new in no_vectorization(active_sumfacts).items(): - sfdict[old] = new + # Find the best vectorization strategy by using a costmodel + width = get_vcl_type_size(np.float64) + strategy = min(vectorization_opportunity_generator(active_sumfacts, width), + key=strategy_cost) + + # Treat the quadrature points + qp, sfdict = strategy + set_quadrature_points(qp) + + logger.debug("decide_vectorization_strategy: Decided for the following strategy:" + "\n".join(stringify_vectorization_strategy(strategy))) + + # We map inactive sum factorization kernels to 0 + sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts}) # Register the results for sf in all_sumfacts: @@ -297,7 +156,7 @@ def vectorization_opportunity_generator(sumfacts, width): # quad_points = [quadrature_points_per_direction()] - if True or get_option("vectorize_allow_quadrature_changes"): + if True or get_option("vectorization_allow_quadrature_changes"): sf = next(iter(sumfacts)) depth = 1 while depth <= width: @@ -331,7 +190,9 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already= for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}), width, qp, - add_to_frozendict(already, {sf_to_decide: sf_to_decide}), + add_to_frozendict(already, + {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))} + ), ): yield opp