From 2c9cdfc6fc19bfb688030af871f806425550b68e Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 3 Apr 2017 14:31:14 +0200
Subject: [PATCH] First improvement of vectorization strategy

---
 python/dune/perftool/sumfact/symbolic.py      | 31 +++++++++----
 python/dune/perftool/sumfact/vectorization.py | 46 +++++++++----------
 2 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index e0f1c658..f44cecc3 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -130,12 +130,35 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
 
     mapper_method = "map_sumfact_kernel"
 
+    #
+    # Some cache key definitions
+    # See the documentation of each property to learn under which circumstances it is used
+    #
+
+    @property
+    def cache_key(self):
+        """ The cache key that can be used in generation magic
+        Any two sum factorization kernels having the same cache_key
+        are realized simulatenously!
+        """
+        return (self.matrix_sequence, self.restriction, self.stage, self.buffer)
+
+    @property
+    def input_key(self):
+        """ A cache key for the input coefficients
+        Any two sum factorization kernels having the same input_key
+        work on the same input coefficient (and are suitable for simultaneous
+        treatment because of that)
+        """
+        return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar)
+
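+    # Illustrative example (hypothetical kernels sf_a and sf_b, not taken from
+    # the code): if sf_a and sf_b differ only in their matrix_sequence (say,
+    # one applies a derivative matrix and the other does not), they share the
+    # same input_key but have different cache_key values, so the vectorization
+    # strategy may pack them into one vectorized kernel reading a single input.
+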
     #
     # Some convenience methods to extract information about the sum factorization kernel
     #
 
     @property
     def length(self):
+        """ The number of matrices to apply """
         return len(self.matrix_sequence)
 
     @property
@@ -146,14 +169,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
     def transposed(self):
         return next(iter(self.matrix_sequence)).transpose
 
-    @property
-    def cache_key(self):
-        """ The cache key that can be used in generation magic,
-        Any two sum factorization kernels having the same cache_key
-        are realized simulatneously!
-        """
-        return hash((self.matrix_sequence, self.restriction, self.stage, self.buffer))
-
     @property
     def vec_index(self):
         """ A tuple with the vector index
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 8ca28d17..ea058464 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -39,13 +39,12 @@ def no_vectorization(sumfacts):
                                               input=get_counted_variable("input")))
 
 
-def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
-    stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage and sf.restriction == restriction])
-    if len(stage_sumfacts) in (3, 4):
+def horizontal_vectorization_strategy(sumfacts):
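+    """ Pack a group of sum factorization kernels into one vectorized kernel
+
+    This is only attempted for groups of 3 or 4 kernels, which fill the four
+    available positions of the joint kernel (positions left over are kept as
+    padding). Any other group size falls back to no_vectorization.
+    """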
+    if len(sumfacts) in (3, 4):
         # Map the sum factorization to their position in the joint kernel
         position_mapping = {}
         available = set(range(4))
-        for sf in stage_sumfacts:
+        for sf in sumfacts:
             if sf.preferred_position is not None:
                 # This asserts that no two kernels want to take the same position
                 # Later on, more complicated stuff might be necessary here.
@@ -54,7 +53,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
                 position_mapping[sf] = sf.preferred_position
 
         # Choose a position for those that have no preferred one!
-        for sumf in stage_sumfacts:
+        for sumf in sumfacts:
             if sumf.preferred_position is None:
                 position_mapping[sumf] = available.pop()
 
@@ -64,37 +63,37 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
 
         # Collect the large matrices!
         large_matrix_sequence = []
-        for i in range(len(next(iter(stage_sumfacts)).matrix_sequence)):
+        for i in range(len(next(iter(sumfacts)).matrix_sequence)):
             # Assert that the matrices of all sum factorizations have the same size
-            assert len(set(tuple(sf.matrix_sequence[i].rows for sf in stage_sumfacts))) == 1
-            assert len(set(tuple(sf.matrix_sequence[i].cols for sf in stage_sumfacts))) == 1
+            assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1
+            assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1
 
             # Collect the derivative information
             derivative = [False] * 4
-            for sf in stage_sumfacts:
+            for sf in sumfacts:
                 derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative
 
-            large = BasisTabulationMatrixArray(rows=next(iter(stage_sumfacts)).matrix_sequence[i].rows,
-                                               cols=next(iter(stage_sumfacts)).matrix_sequence[i].cols,
-                                               transpose=next(iter(stage_sumfacts)).matrix_sequence[i].transpose,
+            large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows,
+                                               cols=next(iter(sumfacts)).matrix_sequence[i].cols,
+                                               transpose=next(iter(sumfacts)).matrix_sequence[i].transpose,
                                                derivative=tuple(derivative),
-                                               face=next(iter(stage_sumfacts)).matrix_sequence[i].face,
+                                               face=next(iter(sumfacts)).matrix_sequence[i].face,
                                                )
             large_matrix_sequence.append(large)
 
-        for sumf in stage_sumfacts:
+        for sumf in sumfacts:
             _cache_vectorization_info(sumf,
                                       sumf.copy(matrix_sequence=tuple(large_matrix_sequence),
                                                 buffer=buf,
                                                 input=inp,
                                                 index=position_mapping[sumf],
                                                 padding=frozenset(available),
-                                                insn_dep=frozenset().union(sf.insn_dep for sf in stage_sumfacts),
+                                                insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts),
                                                 )
                                       )
     else:
         # Disable vectorization strategy
-        no_vectorization(stage_sumfacts)
+        no_vectorization(sumfacts)
 
 
 def decide_vectorization_strategy():
@@ -108,12 +107,9 @@ def decide_vectorization_strategy():
     if not get_option("vectorize_grads"):
         no_vectorization(sumfacts)
     else:
-        res = (Restriction.NONE, Restriction.POSITIVE, Restriction.NEGATIVE)
-        # Stage 1 kernels
-        for restriction in res:
-            decide_stage_vectorization_strategy(sumfacts, 1, restriction)
-
-        # Stage 3 kernels
-        import itertools as it
-        for restriction in it.product(res, res):
-            decide_stage_vectorization_strategy(sumfacts, 3, restriction)
+        # For now, the strategy is to only group sum factorization
+        # kernels that work on the same input coefficient.
+        inputkeys = set(sf.input_key for sf in sumfacts)
+        for inputkey in inputkeys:
+            sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
+            horizontal_vectorization_strategy(sumfact_filter)
-- 
GitLab