From 193fb5f7c4ba0c95f78b7df349ffd423d3c2e64e Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Wed, 14 Feb 2018 10:12:19 +0100
Subject: [PATCH] A first draft of lower/upper half vectorization for stage 1

---
 python/dune/perftool/loopy/vcl.py             |   9 +
 python/dune/perftool/sumfact/symbolic.py      |  11 +-
 python/dune/perftool/sumfact/tabulation.py    |   2 +-
 python/dune/perftool/sumfact/vectorization.py | 180 +++++++++++-------
 4 files changed, 130 insertions(+), 72 deletions(-)

diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py
index ead55725..0aef74a5 100644
--- a/python/dune/perftool/loopy/vcl.py
+++ b/python/dune/perftool/loopy/vcl.py
@@ -76,8 +76,17 @@ class ExplicitVCLCast(lp.symbolic.FunctionIdentifier):
         return get_vcl_typename(self.nptype, vector_width=self.vector_width)
 
 
+class VCLLowerUpperLoad(ExplicitVCLCast):
+    pass
+
+
 @function_mangler
 def vcl_cast_mangler(knl, func, arg_dtypes):
+    if isinstance(func, VCLLowerUpperLoad):
+        return lp.CallMangleInfo(func.name,
+                                 (lp.types.NumpyType(func.nptype),),
+                                 arg_dtypes)
+
     if isinstance(func, ExplicitVCLCast):
         return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],))
 
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 393a2275..66958d0b 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -6,7 +6,7 @@ from dune.perftool.pdelab.geometry import local_dimension, world_dimension
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray
 from dune.perftool.loopy.target import dtype_floatingpoint
-from dune.perftool.loopy.vcl import ExplicitVCLCast
+from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad
 
 from pytools import ImmutableRecord, product
 
@@ -61,7 +61,11 @@ class VectorSumfactKernelInput(SumfactKernelInputBase):
             # The lower and the upper part of the SIMD register use
             # the same input coefficient, we combine the SIMD register
             # from two shorter SIMD types
-            raise NotImplementedError("Lower/Upper half SIMD loads not implemented!")
+            return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()),
+                             (self.inputs[0].realize_direct(shape, inames),
+                              self.inputs[len(self.inputs) // 2].realize_direct(shape, inames),
+                              )
+                             )
         else:
             # The input does not exhibit a broadcastable structure, we
             # need to load scalars into the SIMD vector.
@@ -405,7 +409,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         # Assert all the properties that need to be the same across all subkernels
         assert len(set(k.stage for k in kernels)) == 1
         assert len(set(k.length for k in kernels)) == 1
-        assert len(set(k.restriction for k in kernels)) == 1
         assert len(set(k.within_inames for k in kernels)) == 1
         assert len(set(k.predicates for k in kernels)) == 1
 
@@ -413,7 +416,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
         for i in range(kernels[0].length):
             assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1
             assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1
-            assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1
+            assert len(set(tuple(k.matrix_sequence[i].direction for k in kernels))) == 1
             assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1
 
         # Join the instruction dependencies of all subkernels
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index bc671b70..ca00eee5 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -131,7 +131,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         assert len(set(t.quadrature_size for t in tabs)) == 1
         assert len(set(t.basis_size for t in tabs)) == 1
         assert len(set(t.transpose for t in tabs)) == 1
-        assert len(set(t.face for t in tabs)) == 1
+        assert len(set(t.direction for t in tabs)) == 1
         assert len(set(t.slice_size for t in tabs)) == 1
         self.tabs = tabs
 
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 7c9c5495..c79161ac 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -265,85 +265,131 @@ def level2_optimal_vectorization_strategy(sumfacts, width, qp):
 
     for key in keys:
         key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key)
-        key_strategy = level3_optimal_vectorization_strategy(key_sumfacts, width, qp)
+        key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp),
+                           key=fixedqp_strategy_costfunction(qp))
         sfdict = add_to_frozendict(sfdict, key_strategy)
 
     return sfdict
 
 
-def level3_optimal_vectorization_strategy(sumfacts, width, qp):
-    # Find the sets of simultaneously realizable kernels (thats an equivalence relation)
-    keys = frozenset(sf.input_key for sf in sumfacts)
+def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
+    for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp):
+        # Add non-vectorized implementation information to all kernels that are not present in
+        # the optimal strategy
+        yield add_to_frozendict(opp,
+                                {sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp})
 
-    # Find minimums for each of these sets
-    sfdict = frozendict()
 
-    for key in keys:
-        key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
-        minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
-                      key=fixedqp_strategy_costfunction(qp))
-        sfdict = add_to_frozendict(sfdict, minimum)
+def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()):
+    if len(sumfacts) == 0:
+        yield already
+        return
 
-    return sfdict
+    # We store the information whether a vectorization opportunity has been yielded from this
+    # generator to yield an incomplete strategy if not (which is then completed with unvectorized
+    # kernel implementations)
+    yielded = False
 
+    # Find the number of input coefficients we can work on
+    keys = frozenset(sf.input_key for sf in sumfacts)
+    inputkey_sumfacts = [frozenset(filter(lambda sf: sf.input_key == key, sumfacts)) for key in keys]
 
-def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
-    if len(sumfacts) == 0:
-        # We have gone into recursion deep enough to have all sum factorization nodes
-        # assigned their vectorized counterpart. We can yield the result now!
+    for parallel in (1, 2):
+        if parallel == 2 and next(iter(sumfacts)).stage == 3:
+            continue
+        for which in filter(lambda w: w == tuple(sorted(w)),
+                            it.permutations(range(len(keys)), parallel)):
+            horizontal = 1
+            while horizontal <= width // parallel:
+                generators = [filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
+                                     it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal) for part in which)))]
+                if horizontal >=4:
+                    generators.append(filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
+                                             it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal - 1) for part in which))))
+                for combo in it.chain(*generators):
+                    combo = sum(combo, ())
+
+                    vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
+                    if vecdict is None:
+                        # This particular choice was rejected for some reason.
+                        # Possible reasons:
+                        # * the quadrature point tuple not being suitable
+                        #   for this vectorization strategy
+                        continue
+
+                    # Go into recursion to also vectorize all kernels not in this combo
+                    for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
+                                                                               width,
+                                                                               qp,
+                                                                               add_to_frozendict(already, vecdict),
+                                                                               ):
+                        yielded = True
+                        yield opp
+
+                horizontal *= 2
+
+    # If we did not yield on this recursion level, yield what we got so far
+    if not yielded:
         yield already
-        return
 
-    # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
-    # fromlist strategy to pick correct strategies across different program runs
-    sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
-
-    # Otherwise we pick a random sum factorization kernel and construct all the vectorization
-    # opportunities realizing this particular kernel and go into recursion.
-    sf_to_decide = sumfacts[0]
-
-    # Have "unvectorized" as an option, although it is not good
-    for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
-                                                              width,
-                                                              qp,
-                                                              add_to_frozendict(already,
-                                                                                {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
-                                                                                ),
-                                                              ):
-        yield opp
-
-    horizontal = 1
-    while horizontal <= width:
-        # Iterate over the possible combinations of sum factorization kernels
-        # taking into account all the permutations of kernels. This also includes
-        # combinations which use a padding of 1 - but only for pure horizontality.
-        generators = [it.permutations(sumfacts, horizontal)]
-        if horizontal >= 4:
-            generators.append(it.permutations(sumfacts, horizontal - 1))
-        for combo in it.chain(*generators):
-            # The chosen kernels must be part of the kernels for recursion
-            # to work correctly
-            if sf_to_decide not in combo:
-                continue
-
-            # Set up the vectorization dict for this combo
-            vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
-            if vecdict is None:
-                # This particular choice was rejected for some reason.
-                # Possible reasons:
-                # * the quadrature point tuple not being suitable
-                #   for this vectorization strategy
-                continue
-
-            # Go into recursion to also vectorize all kernels not in this combo
-            for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
-                                                                      width,
-                                                                      qp,
-                                                                      add_to_frozendict(already, vecdict),
-                                                                      ):
-                yield opp
-
-        horizontal = horizontal * 2
+#
+# def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
+#     if len(sumfacts) == 0:
+#         # We have gone into recursion deep enough to have all sum factorization nodes
+#         # assigned their vectorized counterpart. We can yield the result now!
+#         yield already
+#         return
+#
+#     # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
+#     # fromlist strategy to pick correct strategies across different program runs
+#     sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
+#
+#     # Otherwise we pick a random sum factorization kernel and construct all the vectorization
+#     # opportunities realizing this particular kernel and go into recursion.
+#     sf_to_decide = sumfacts[0]
+#
+#     # Have "unvectorized" as an option, although it is not good
+#     for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
+#                                                               width,
+#                                                               qp,
+#                                                               add_to_frozendict(already,
+#                                                                                 {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
+#                                                                                 ),
+#                                                               ):
+#         yield opp
+#
+#     horizontal = 1
+#     while horizontal <= width:
+#         # Iterate over the possible combinations of sum factorization kernels
+#         # taking into account all the permutations of kernels. This also includes
+#         # combinations which use a padding of 1 - but only for pure horizontality.
+#         generators = [it.permutations(sumfacts, horizontal)]
+#         if horizontal >= 4:
+#             generators.append(it.permutations(sumfacts, horizontal - 1))
+#         for combo in it.chain(*generators):
+#             # The chosen kernels must be part of the kernels for recursion
+#             # to work correctly
+#             if sf_to_decide not in combo:
+#                 continue
+#
+#             # Set up the vectorization dict for this combo
+#             vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
+#             if vecdict is None:
+#                 # This particular choice was rejected for some reason.
+#                 # Possible reasons:
+#                 # * the quadrature point tuple not being suitable
+#                 #   for this vectorization strategy
+#                 continue
+#
+#             # Go into recursion to also vectorize all kernels not in this combo
+#             for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
+#                                                                       width,
+#                                                                       qp,
+#                                                                       add_to_frozendict(already, vecdict),
+#                                                                       ):
+#                 yield opp
+#
+#         horizontal = horizontal * 2
 
 
 def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
-- 
GitLab