diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 8d0081dbf4739005c7fb9c5ffc9fa7aa48806c93..ad01abbc8360704e1842be9c503b0117063fdaee 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -288,33 +288,35 @@ def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, alread inoutkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.inout_key == key, sumfacts))) for key in keys] for parallel in (1, 2): + if parallel > len(keys): + continue + if parallel == 2 and next(iter(sumfacts)).stage == 3: continue - for which in filter(lambda w: w == tuple(sorted(w)), - it.permutations(range(len(keys)), parallel)): - horizontal = 1 - while horizontal <= width // parallel: - combo = sum((inoutkey_sumfacts[part][:horizontal] for part in which), ()) - - vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) - horizontal *= 2 - - if vecdict is None: - # This particular choice was rejected for some reason. - # Possible reasons: - # * the quadrature point tuple not being suitable - # for this vectorization strategy - # * there are not enough horizontal kernels - continue - - # Go into recursion to also vectorize all kernels not in this combo - for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), - width, - qp, - add_to_frozendict(already, vecdict), - ): - yielded = True - yield opp + + horizontal = 1 + while horizontal <= width // parallel: + combo = sum((inoutkey_sumfacts[part][:horizontal] for part in range(parallel)), ()) + + vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) + horizontal *= 2 + + if vecdict is None: + # This particular choice was rejected for some reason. + # Possible reasons: + # * the quadrature point tuple not being suitable + # for this vectorization strategy + # * there are not enough horizontal kernels + continue + + # Go into recursion to also vectorize all kernels not in this combo + for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), + width, + qp, + add_to_frozendict(already, vecdict), + ): + yielded = True + yield opp # If we did not yield on this recursion level, yield what we got so far if not yielded: