From 880adbdc802d87796f4901a4f14935d3b7e14c2c Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 12 Feb 2018 15:10:44 +0100
Subject: [PATCH] Add one level of indirection into the vectorization generator

---
 python/dune/perftool/sumfact/symbolic.py      |  8 ++++++--
 python/dune/perftool/sumfact/vectorization.py | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 627c8fc0..393a2275 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -202,6 +202,11 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     # Watch out for the documentation to see which key is used unter what circumstances
     #
 
+    @property
+    def parallel_key(self):
+        """ A key that identifies parallelizable kernels. """
+        return tuple(m.basis_size for m in self.matrix_sequence) + (self.stage, self.buffer)
+
     @property
     def cache_key(self):
         """ The cache key that can be used in generation magic
@@ -214,8 +219,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def input_key(self):
         """ A cache key for the input coefficients
         Any two sum factorization kernels having the same input_key
-        work on the same input coefficient (and are suitable for simultaneous
-        treatment because of that)
+        work on the same input coefficient
         """
         return (self.input, self.restriction, self.accumvar, self.trial_element_index)
 
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 95286389..7c9c5495 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -257,6 +257,21 @@ def level1_optimal_vectorization_strategy(sumfacts, width):
 
 
 def level2_optimal_vectorization_strategy(sumfacts, width, qp):
+    # Find the sets of simultaneously realizable kernels
+    keys = frozenset(sf.parallel_key for sf in sumfacts)
+
+    # Compute the level3 optimal strategy for each of these sets and merge the results
+    sfdict = frozendict()
+
+    for key in keys:
+        key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key)
+        key_strategy = level3_optimal_vectorization_strategy(key_sumfacts, width, qp)
+        sfdict = add_to_frozendict(sfdict, key_strategy)
+
+    return sfdict
+
+
+def level3_optimal_vectorization_strategy(sumfacts, width, qp):
     # Find the sets of simultaneously realizable kernels (thats an equivalence relation)
     keys = frozenset(sf.input_key for sf in sumfacts)
 
-- 
GitLab