diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index 66958d0b375476b200a1858f8030f9ff5ea28629..ea5255725b527d6fa59f3f005f49cdc0d2a11dd8 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -235,6 +235,19 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): # Some convenience methods to extract information about the sum factorization kernel # + def __lt__(self, other): + if self.parallel_key != other.parallel_key: + return self.parallel_key < other.parallel_key + if self.input_key != other.input_key: + return self.input_key < other.input_key + if self.position_priority == other.position_priority: + return repr(self) < repr(other) + if self.position_priority is None: + return False + if other.position_priority is None: + return True + return self.position_priority < other.position_priority + @property def length(self): """ The number of matrices to apply """ diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index c79161ac61cdb932cdfc419888e9d2aef7ddf347..e4a044edf0e62b8b2176efe4755a21617f0fcd8c 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -48,13 +48,6 @@ def attach_vectorization_info(sf): return _cache_vectorization_info(sf, None) -def position_penalty_factor(sf): - if isinstance(sf, SumfactKernel) or sf.vertical_width > 1: - return 1 - else: - return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length)) - - @backend(interface="vectorization_strategy", name="model") def costmodel(sf): # Penalize vertical vectorization @@ -66,7 +59,7 @@ def costmodel(sf): scalar_penalty = get_vcl_type_size(dtype_floatingpoint()) # Return total operations - return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty + return sf.operations * vertical_penalty * scalar_penalty @backend(interface="vectorization_strategy", name="fromlist") @@ -90,7 +83,7 @@ def explicit_costfunction(sf): if sf.horizontal_width == horizontal and sf.vertical_width == vertical: # Penalize position mapping - return sf.operations * position_penalty_factor(sf) + return sf.operations else: return 1000000000000 @@ -292,7 +285,7 @@ def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, alread # Find the number of input coefficients we can work on keys = frozenset(sf.input_key for sf in sumfacts) - inputkey_sumfacts = [frozenset(filter(lambda sf: sf.input_key == key, sumfacts)) for key in keys] + inputkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.input_key == key, sumfacts))) for key in keys] for parallel in (1, 2): if parallel == 2 and next(iter(sumfacts)).stage == 3: @@ -301,98 +294,38 @@ def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, alread it.permutations(range(len(keys)), parallel)): horizontal = 1 while horizontal <= width // parallel: - generators = [filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))), - it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal) for part in which)))] - if horizontal >=4: - generators.append(filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))), - it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal - 1) for part in which)))) - for combo in it.chain(*generators): - combo = sum(combo, ()) - - vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) - if vecdict is None: - # This particular choice was rejected for some reason. - # Possible reasons: - # * the quadrature point tuple not being suitable - # for this vectorization strategy - continue - - # Go into recursion to also vectorize all kernels not in this combo - for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), - width, - qp, - add_to_frozendict(already, vecdict), - ): - yielded = True - yield opp + combo = sum((inputkey_sumfacts[part][:horizontal] for part in which), ()) + vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) horizontal *= 2 + if vecdict is None: + # This particular choice was rejected for some reason. + # Possible reasons: + # * the quadrature point tuple not being suitable + # for this vectorization strategy + # * there are not enough horizontal kernels + continue + + # Go into recursion to also vectorize all kernels not in this combo + for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), + width, + qp, + add_to_frozendict(already, vecdict), + ): + yielded = True + yield opp + # If we did not yield on this recursion level, yield what we got so far if not yielded: yield already -# -# def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()): -# if len(sumfacts) == 0: -# # We have gone into recursion deep enough to have all sum factorization nodes -# # assigned their vectorized counterpart. We can yield the result now! -# yield already -# return -# -# # Ensure a deterministic order of the given sumfact kernels. This is necessary for the -# # fromlist strategy to pick correct strategies across different program runs -# sumfacts = sorted(sumfacts, key=lambda sf: repr(sf)) -# -# # Otherwise we pick a random sum factorization kernel and construct all the vectorization -# # opportunities realizing this particular kernel and go into recursion. -# sf_to_decide = sumfacts[0] -# -# # Have "unvectorized" as an option, although it is not good -# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]), -# width, -# qp, -# add_to_frozendict(already, -# {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))} -# ), -# ): -# yield opp -# -# horizontal = 1 -# while horizontal <= width: -# # Iterate over the possible combinations of sum factorization kernels -# # taking into account all the permutations of kernels. This also includes -# # combinations which use a padding of 1 - but only for pure horizontality. -# generators = [it.permutations(sumfacts, horizontal)] -# if horizontal >= 4: -# generators.append(it.permutations(sumfacts, horizontal - 1)) -# for combo in it.chain(*generators): -# # The chosen kernels must be part of the kernels for recursion -# # to work correctly -# if sf_to_decide not in combo: -# continue -# -# # Set up the vectorization dict for this combo -# vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp) -# if vecdict is None: -# # This particular choice was rejected for some reason. -# # Possible reasons: -# # * the quadrature point tuple not being suitable -# # for this vectorization strategy -# continue -# -# # Go into recursion to also vectorize all kernels not in this combo -# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo), -# width, -# qp, -# add_to_frozendict(already, vecdict), -# ): -# yield opp -# -# horizontal = horizontal * 2 - def get_vectorization_dict(sumfacts, vertical, horizontal, qp): + # Discard opportunities that do not contain enough horizontal kernels + if len(sumfacts) not in (horizontal, horizontal - 1): + return None + # Enhance the list of sumfact nodes by adding vertical splittings kernels = [] for sf in sumfacts: