From 193fb5f7c4ba0c95f78b7df349ffd423d3c2e64e Mon Sep 17 00:00:00 2001 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> Date: Wed, 14 Feb 2018 10:12:19 +0100 Subject: [PATCH] A first draft of lower/upper half vectorization for stage 1 --- python/dune/perftool/loopy/vcl.py | 9 + python/dune/perftool/sumfact/symbolic.py | 11 +- python/dune/perftool/sumfact/tabulation.py | 2 +- python/dune/perftool/sumfact/vectorization.py | 180 +++++++++++------- 4 files changed, 130 insertions(+), 72 deletions(-) diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py index ead55725..0aef74a5 100644 --- a/python/dune/perftool/loopy/vcl.py +++ b/python/dune/perftool/loopy/vcl.py @@ -76,8 +76,17 @@ class ExplicitVCLCast(lp.symbolic.FunctionIdentifier): return get_vcl_typename(self.nptype, vector_width=self.vector_width) +class VCLLowerUpperLoad(ExplicitVCLCast): + pass + + @function_mangler def vcl_cast_mangler(knl, func, arg_dtypes): + if isinstance(func, VCLLowerUpperLoad): + return lp.CallMangleInfo(func.name, + (lp.types.NumpyType(func.nptype),), + arg_dtypes) + if isinstance(func, ExplicitVCLCast): return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],)) diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index 393a2275..66958d0b 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -6,7 +6,7 @@ from dune.perftool.pdelab.geometry import local_dimension, world_dimension from dune.perftool.sumfact.quadrature import quadrature_inames from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray from dune.perftool.loopy.target import dtype_floatingpoint -from dune.perftool.loopy.vcl import ExplicitVCLCast +from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad from pytools import ImmutableRecord, product @@ -61,7 +61,11 @@ class VectorSumfactKernelInput(SumfactKernelInputBase): # The lower and the upper part of the SIMD register use # the same input coefficient, we combine the SIMD register # from two shorter SIMD types - raise NotImplementedError("Lower/Upper half SIMD loads not implemented!") + return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()), + (self.inputs[0].realize_direct(shape, inames), + self.inputs[len(self.inputs) // 2].realize_direct(shape, inames), + ) + ) else: # The input does not exhibit a broadcastable structure, we # need to load scalars into the SIMD vector. @@ -405,7 +409,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) # Assert all the properties that need to be the same across all subkernels assert len(set(k.stage for k in kernels)) == 1 assert len(set(k.length for k in kernels)) == 1 - assert len(set(k.restriction for k in kernels)) == 1 assert len(set(k.within_inames for k in kernels)) == 1 assert len(set(k.predicates for k in kernels)) == 1 @@ -413,7 +416,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) for i in range(kernels[0].length): assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1 assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1 - assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].direction for k in kernels))) == 1 assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1 # Join the instruction dependencies of all subkernels diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index bc671b70..ca00eee5 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -131,7 +131,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): assert len(set(t.quadrature_size for t in tabs)) == 1 assert len(set(t.basis_size for t in tabs)) == 1 assert len(set(t.transpose for t in tabs)) == 1 - assert len(set(t.face for t in tabs)) == 1 + assert len(set(t.direction for t in tabs)) == 1 assert len(set(t.slice_size for t in tabs)) == 1 self.tabs = tabs diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 7c9c5495..c79161ac 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -265,85 +265,131 @@ def level2_optimal_vectorization_strategy(sumfacts, width, qp): for key in keys: key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key) - key_strategy = level3_optimal_vectorization_strategy(key_sumfacts, width, qp) + key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp), + key=fixedqp_strategy_costfunction(qp)) sfdict = add_to_frozendict(sfdict, key_strategy) return sfdict -def level3_optimal_vectorization_strategy(sumfacts, width, qp): - # Find the sets of simultaneously realizable kernels (thats an equivalence relation) - keys = frozenset(sf.input_key for sf in sumfacts) +def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): + for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): + # Add non-vectorized implementation information to all kernels that are not present in + # the optimal strategy + yield add_to_frozendict(opp, + {sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp}) - # Find minimums for each of these sets - sfdict = frozendict() - for key in keys: - key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key) - minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp), - key=fixedqp_strategy_costfunction(qp)) - sfdict = add_to_frozendict(sfdict, minimum) +def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()): + if len(sumfacts) == 0: + yield already + return - return sfdict + # We store the information whether a vectorization opportunity has been yielded from this + # generator to yield an incomplete strategy if not (which is then completed with unvectorized + # kernel implementations) + yielded = False + # Find the number of input coefficients we can work on + keys = frozenset(sf.input_key for sf in sumfacts) + inputkey_sumfacts = [frozenset(filter(lambda sf: sf.input_key == key, sumfacts)) for key in keys] -def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()): - if len(sumfacts) == 0: - # We have gone into recursion deep enough to have all sum factorization nodes - # assigned their vectorized counterpart. We can yield the result now! + for parallel in (1, 2): + if parallel == 2 and next(iter(sumfacts)).stage == 3: + continue + for which in filter(lambda w: w == tuple(sorted(w)), + it.permutations(range(len(keys)), parallel)): + horizontal = 1 + while horizontal <= width // parallel: + generators = [filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))), + it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal) for part in which)))] + if horizontal >=4: + generators.append(filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))), + it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal - 1) for part in which)))) + for combo in it.chain(*generators): + combo = sum(combo, ()) + + vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) + if vecdict is None: + # This particular choice was rejected for some reason. + # Possible reasons: + # * the quadrature point tuple not being suitable + # for this vectorization strategy + continue + + # Go into recursion to also vectorize all kernels not in this combo + for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), + width, + qp, + add_to_frozendict(already, vecdict), + ): + yielded = True + yield opp + + horizontal *= 2 + + # If we did not yield on this recursion level, yield what we got so far + if not yielded: yield already - return - # Ensure a deterministic order of the given sumfact kernels. This is necessary for the - # fromlist strategy to pick correct strategies across different program runs - sumfacts = sorted(sumfacts, key=lambda sf: repr(sf)) - - # Otherwise we pick a random sum factorization kernel and construct all the vectorization - # opportunities realizing this particular kernel and go into recursion. - sf_to_decide = sumfacts[0] - - # Have "unvectorized" as an option, although it is not good - for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]), - width, - qp, - add_to_frozendict(already, - {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))} - ), - ): - yield opp - - horizontal = 1 - while horizontal <= width: - # Iterate over the possible combinations of sum factorization kernels - # taking into account all the permutations of kernels. This also includes - # combinations which use a padding of 1 - but only for pure horizontality. - generators = [it.permutations(sumfacts, horizontal)] - if horizontal >= 4: - generators.append(it.permutations(sumfacts, horizontal - 1)) - for combo in it.chain(*generators): - # The chosen kernels must be part of the kernels for recursion - # to work correctly - if sf_to_decide not in combo: - continue - - # Set up the vectorization dict for this combo - vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp) - if vecdict is None: - # This particular choice was rejected for some reason. - # Possible reasons: - # * the quadrature point tuple not being suitable - # for this vectorization strategy - continue - - # Go into recursion to also vectorize all kernels not in this combo - for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo), - width, - qp, - add_to_frozendict(already, vecdict), - ): - yield opp - - horizontal = horizontal * 2 +# +# def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()): +# if len(sumfacts) == 0: +# # We have gone into recursion deep enough to have all sum factorization nodes +# # assigned their vectorized counterpart. We can yield the result now! +# yield already +# return +# +# # Ensure a deterministic order of the given sumfact kernels. This is necessary for the +# # fromlist strategy to pick correct strategies across different program runs +# sumfacts = sorted(sumfacts, key=lambda sf: repr(sf)) +# +# # Otherwise we pick a random sum factorization kernel and construct all the vectorization +# # opportunities realizing this particular kernel and go into recursion. +# sf_to_decide = sumfacts[0] +# +# # Have "unvectorized" as an option, although it is not good +# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]), +# width, +# qp, +# add_to_frozendict(already, +# {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))} +# ), +# ): +# yield opp +# +# horizontal = 1 +# while horizontal <= width: +# # Iterate over the possible combinations of sum factorization kernels +# # taking into account all the permutations of kernels. This also includes +# # combinations which use a padding of 1 - but only for pure horizontality. +# generators = [it.permutations(sumfacts, horizontal)] +# if horizontal >= 4: +# generators.append(it.permutations(sumfacts, horizontal - 1)) +# for combo in it.chain(*generators): +# # The chosen kernels must be part of the kernels for recursion +# # to work correctly +# if sf_to_decide not in combo: +# continue +# +# # Set up the vectorization dict for this combo +# vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp) +# if vecdict is None: +# # This particular choice was rejected for some reason. +# # Possible reasons: +# # * the quadrature point tuple not being suitable +# # for this vectorization strategy +# continue +# +# # Go into recursion to also vectorize all kernels not in this combo +# for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo), +# width, +# qp, +# add_to_frozendict(already, vecdict), +# ): +# yield opp +# +# horizontal = horizontal * 2 def get_vectorization_dict(sumfacts, vertical, horizontal, qp): -- GitLab