diff --git a/python/dune/perftool/sumfact/costmodel.py b/python/dune/perftool/sumfact/costmodel.py
deleted file mode 100644
index c036276703981cf136442ae1d48c51368eaa79bc..0000000000000000000000000000000000000000
--- a/python/dune/perftool/sumfact/costmodel.py
+++ /dev/null
@@ -1,25 +0,0 @@
-""" Implementation of the cost model used for vectorization
-"""
-
-
-def ilp_heuristic(sf):
-    """ A heuristic measure for the ILP capabilities of the kernel
-
-    Return is the throttle of insufficient ILP, generally in (0., 1.0].
-    1.0 indicates perfect instruction level parallelism.
-    """
-    return 1.0
-
-
-def vectorization_costfunction(sf):
-    # kernel characteristics necessary
-    traffic = sf.memory_traffic
-    intensity = sf.operations / sf.memory_traffic
-
-    # hardware characteristics necessary
-    bandwidth = 100000000000000000.0
-    peakperformance = 100.0
-
-    roofline = min(bandwidth * intensity, ilp_heuristics(sf) * peakperformance)
-
-    return (traffic * intensity) / roofline
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 0d7c4b038827a4af1e076f6982d6e50b496e6291..6261a38016b9635c06c6e51cd796461f154fe3a1 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -17,8 +17,10 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                               )
 from dune.perftool.error import PerftoolError
 from dune.perftool.options import get_option
-from dune.perftool.tools import round_to_multiple
+from dune.perftool.tools import add_to_frozendict, round_to_multiple
 
+from frozendict import frozendict
+import itertools as it
 import loopy as lp
 import numpy as np
 
@@ -264,3 +266,120 @@ def decide_vectorization_strategy():
     # Register the results
     for sf in all_sumfacts:
         _cache_vectorization_info(sf, sfdict.get(sf, no_vec(sf)))
+
+
+def vectorization_opportunity_generator(sumfacts, width):
+    """ Generator that yields all vectorization opportunities for the given
+    sum factorization kernels as tuples of quadrature point tuple and vectorization
+    dictionary """
+    #
+    # Find all the possible quadrature point tuples
+    #
+    quad_points = [quadrature_points_per_direction()]
+
+    if True or get_option("vectorize_allow_quadrature_changes"):
+        sf = next(iter(sumfacts))
+        depth = 1
+        while depth <= width:
+            i = 0 if sf.matrix_sequence[0].face is None else 1
+            quad = list(quadrature_points_per_direction())
+            quad[i] = round_to_multiple(quad[i], depth)
+            quad_points.append(tuple(quad))
+            depth = depth * 2
+        quad_points = list(set(quad_points))
+
+    for qp in quad_points:
+        #
+        # Determine vectorization opportunities given a fixed quadrature point number
+        #
+        for opp in fixed_quad_vectorization_opportunity_generator(frozenset(sumfacts), width, qp):
+            yield qp, opp
+
+
+def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
+    if len(sumfacts) == 0:
+        # We have gone into recursion deep enough to have all sum factorization nodes
+        # assigned their vectorized counterpart. We can yield the result now!
+        yield already
+        return
+
+    # Otherwise we pick a random sum factorization kernel and construct all the vectorization
+    # opportunities realizing this particular kernel and go into recursion.
+    sf_to_decide = next(iter(sumfacts))
+
+    # Have "unvectorized" as an option, although it is not good
+    for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
+                                                              width,
+                                                              qp,
+                                                              add_to_frozendict(already, {sf_to_decide: sf_to_decide}),
+                                                              ):
+        yield opp
+
+    # Find all the sum factorization kernels that the chosen kernel can be parallelized with
+    #
+    # TODO: Right now we check for same input, which is not actually needed in order
+    #       to be a suitable candidate! We should relax this concept at some point!
+    candidates = list(filter(lambda sf: sf.input_key == sf_to_decide.input_key, sumfacts))
+
+    horizontal = 1
+    while horizontal <= width:
+        # Iterate over the possible combinations of sum factorization kernels
+        # taking into account all the permutations of kernels. This also includes
+        # combinations which use a padding of 1.
+        for combo in it.chain(it.permutations(candidates, horizontal),
+                              it.permutations(candidates, horizontal - 1)):
+            # The chosen kernel must be part of the combination for recursion
+            # to work correctly
+            if sf_to_decide not in combo:
+                continue
+
+            # Set up the vectorization dict for this combo
+            vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
+            if vecdict is None:
+                # This particular choice was rejected for some reason.
+                # Possible reasons:
+                # * the quadrature point tuple not being suitable
+                #   for this vectorization strategy
+                continue
+
+            # Go into recursion to also vectorize all kernels not in this combo
+            for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference(combo),
+                                                                      width,
+                                                                      qp,
+                                                                      add_to_frozendict(already, vecdict),
+                                                                      ):
+                yield opp
+
+        horizontal = horizontal * 2
+
+
+def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
+    # Enhance the list of sumfact nodes by adding vertical splittings
+    kernels = []
+    for sf in sumfacts:
+        # No slicing needed in the pure horizontal case
+        if vertical == 1:
+            kernels.append(sf)
+            continue
+
+        # Determine the slicing direction
+        slice_direction = 0 if sf.matrix_sequence[0].face is None else 1
+        if qp[slice_direction] % vertical != 0:
+            return None
+
+        # Split the basis tabulation matrices
+        oldtab = sf.matrix_sequence[slice_direction]
+        for i in range(vertical):
+            seq = list(sf.matrix_sequence)
+            seq[slice_direction] = oldtab.copy(slice_size=vertical,
+                                               slice_index=i)
+            kernels.append(sf.copy(matrix_sequence=tuple(seq)))
+
+
+    # Join the new kernels into a sum factorization node
+    buffer = get_counted_variable("joined_buffer")
+    return {sf: VectorizedSumfactKernel(kernels=tuple(kernels),
+                                        horizontal_width=horizontal,
+                                        vertical_width=vertical,
+                                        buffer=buffer,
+                                        ) for sf in sumfacts}
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index 260fc9dfe6151492405e8b6996afa1623e9f3896..d358f3ab45ef8231a3a952a881167e7108c14704 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -3,6 +3,7 @@ from __future__ import absolute_import
 
 import loopy as lp
 import pymbolic.primitives as prim
+import frozendict
 
 
 def get_pymbolic_basename(expr):
@@ -67,3 +68,9 @@ def ceildiv(a, b):
 
 def round_to_multiple(x, n):
     return n * ceildiv(x, n)
+
+
+def add_to_frozendict(fd, valdict):
+    t = dict(fd)
+    t.update(valdict)
+    return frozendict.frozendict(t)
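Note (not part of the patch): the recursion in fixed_quad_vectorization_opportunity_generator is easiest to see in isolation. The standalone sketch below mimics that pattern with plain strings standing in for sum factorization kernels and tuples standing in for VectorizedSumfactKernel groups; the function name "opportunities" and the toy data are made up for illustration, only add_to_frozendict is copied from the patch and frozendict is the same third-party package the patch imports.

# Illustrative sketch only -- mirrors the recursion pattern of
# fixed_quad_vectorization_opportunity_generator with made-up stand-in data.
import itertools as it

from frozendict import frozendict


def add_to_frozendict(fd, valdict):
    # Same helper as added to dune/perftool/tools.py, inlined to keep this standalone
    t = dict(fd)
    t.update(valdict)
    return frozendict(t)


def opportunities(kernels, width, already=frozendict()):
    """ Yield every complete assignment of the stand-in kernels to either
    themselves (unvectorized) or a group of size 'width' (vectorized) """
    if not kernels:
        # Every kernel has been assigned a realization: yield the mapping
        yield already
        return

    k = next(iter(kernels))

    # Option 1: keep this kernel unvectorized
    for opp in opportunities(kernels - {k}, width,
                             add_to_frozendict(already, {k: k})):
        yield opp

    # Option 2: group it with other kernels into a SIMD batch; every member of
    # the batch maps to the same shared group object, like the vecdict above
    for combo in it.permutations(sorted(kernels), width):
        if k not in combo:
            continue
        group = {c: combo for c in combo}
        for opp in opportunities(kernels - set(combo), width,
                                 add_to_frozendict(already, group)):
            yield opp


if __name__ == "__main__":
    # Four toy "kernels" and a vector width of two lanes
    for opp in opportunities(frozenset("abcd"), 2):
        print(dict(opp))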