Commit 2c9cdfc6 authored by Dominic Kempf

First improvement of vectorization strategy

parent f84650ac
@@ -130,12 +130,35 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
mapper_method = "map_sumfact_kernel"
#
# Some cache key definitions
# Consult the documentation to see which key is used under which circumstances
#
@property
def cache_key(self):
""" The cache key that can be used in generation magic
Any two sum factorization kernels having the same cache_key
are realized simulatenously!
"""
return (self.matrix_sequence, self.restriction, self.stage, self.buffer)
@property
def input_key(self):
""" A cache key for the input coefficients
Any two sum factorization kernels having the same input_key
work on the same input coefficient (and are suitable for simultaneous
treatment because of that)
"""
return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar)
#
# Some convenience methods to extract information about the sum factorization kernel
#
@property
def length(self):
""" The number of matrices to apply """
return len(self.matrix_sequence)
@property
@@ -146,14 +169,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
def transposed(self):
return next(iter(self.matrix_sequence)).transpose
@property
def cache_key(self):
""" The cache key that can be used in generation magic,
Any two sum factorization kernels having the same cache_key
are realized simulatneously!
"""
return hash((self.matrix_sequence, self.restriction, self.stage, self.buffer))
@property
def vec_index(self):
""" A tuple with the vector index
......
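The newly added input_key complements cache_key: kernels that agree in restriction, stage, coefficient function, element, component and accumulation variable read the same input coefficient and are therefore candidates for being fused into one vectorized kernel. A minimal sketch of how such a grouping could be computed, using a simplified stand-in for SumfactKernel (the stand-in class and its fields are illustrative, not the real implementation):

from collections import defaultdict, namedtuple

# Illustrative stand-in carrying only the fields that enter input_key.
Kernel = namedtuple("Kernel", "restriction stage coeff_func element component accumvar")

def input_key(kernel):
    # Mirrors SumfactKernel.input_key: equal keys mean equal input coefficients.
    return (kernel.restriction, kernel.stage, kernel.coeff_func,
            kernel.element, kernel.component, kernel.accumvar)

def group_by_input(kernels):
    # Partition kernels into groups that may be vectorized together.
    groups = defaultdict(list)
    for kernel in kernels:
        groups[input_key(kernel)].append(kernel)
    return groups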
@@ -39,13 +39,12 @@ def no_vectorization(sumfacts):
input=get_counted_variable("input")))
def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage and sf.restriction == restriction])
if len(stage_sumfacts) in (3, 4):
def horizontal_vectorization_strategy(sumfacts):
if len(sumfacts) in (3, 4):
# Map each sum factorization kernel to its position in the joint kernel
position_mapping = {}
available = set(range(4))
for sf in stage_sumfacts:
for sf in sumfacts:
if sf.preferred_position is not None:
# This asserts that no two kernels want to take the same position
# Later on, more complicated stuff might be necessary here.
@@ -54,7 +53,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
position_mapping[sf] = sf.preferred_position
# Choose a position for those that have no preferred one!
for sumf in stage_sumfacts:
for sumf in sumfacts:
if sumf.preferred_position is None:
position_mapping[sumf] = available.pop()
@@ -64,37 +63,37 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
# Collect the large matrices!
large_matrix_sequence = []
for i in range(len(next(iter(stage_sumfacts)).matrix_sequence)):
for i in range(len(next(iter(sumfacts)).matrix_sequence)):
# Assert that the matrices of all sum factorizations have the same size
assert len(set(tuple(sf.matrix_sequence[i].rows for sf in stage_sumfacts))) == 1
assert len(set(tuple(sf.matrix_sequence[i].cols for sf in stage_sumfacts))) == 1
assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1
assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1
# Collect the derivative information
derivative = [False] * 4
for sf in stage_sumfacts:
for sf in sumfacts:
derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative
large = BasisTabulationMatrixArray(rows=next(iter(stage_sumfacts)).matrix_sequence[i].rows,
cols=next(iter(stage_sumfacts)).matrix_sequence[i].cols,
transpose=next(iter(stage_sumfacts)).matrix_sequence[i].transpose,
large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows,
cols=next(iter(sumfacts)).matrix_sequence[i].cols,
transpose=next(iter(sumfacts)).matrix_sequence[i].transpose,
derivative=tuple(derivative),
face=next(iter(stage_sumfacts)).matrix_sequence[i].face,
face=next(iter(sumfacts)).matrix_sequence[i].face,
)
large_matrix_sequence.append(large)
for sumf in stage_sumfacts:
for sumf in sumfacts:
_cache_vectorization_info(sumf,
sumf.copy(matrix_sequence=tuple(large_matrix_sequence),
buffer=buf,
input=inp,
index=position_mapping[sumf],
padding=frozenset(available),
insn_dep=frozenset().union(sf.insn_dep for sf in stage_sumfacts),
insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts),
)
)
else:
# Disable vectorization strategy
no_vectorization(stage_sumfacts)
no_vectorization(sumfacts)
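The position handling above works in two passes: kernels with a preferred_position claim their SIMD lane first, the remaining kernels fill whatever lanes are left, and lanes that stay unclaimed become padding of the fused kernel. A condensed, hedged sketch of that assignment, assuming hashable kernel objects whose preferred_position attribute is either None or a lane index:

def assign_lanes(kernels, width=4):
    # Lanes not yet claimed by an explicit preference.
    available = set(range(width))
    position = {}
    # First pass: honor preferred positions and make sure they do not collide.
    for kernel in kernels:
        if kernel.preferred_position is not None:
            assert kernel.preferred_position in available
            available.discard(kernel.preferred_position)
            position[kernel] = kernel.preferred_position
    # Second pass: distribute the remaining lanes to the other kernels.
    for kernel in kernels:
        if kernel.preferred_position is None:
            position[kernel] = available.pop()
    # Lanes that are still free end up as padding in the vectorized kernel.
    return position, frozenset(available)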
def decide_vectorization_strategy():
@@ -108,12 +107,9 @@ def decide_vectorization_strategy():
if not get_option("vectorize_grads"):
no_vectorization(sumfacts)
else:
res = (Restriction.NONE, Restriction.POSITIVE, Restriction.NEGATIVE)
# Stage 1 kernels
for restriction in res:
decide_stage_vectorization_strategy(sumfacts, 1, restriction)
# Stage 3 kernels
import itertools as it
for restriction in it.product(res, res):
decide_stage_vectorization_strategy(sumfacts, 3, restriction)
# Currently the strategy relies on the fact that we only group sum
# factorization kernels that share the same input.
inputkeys = set(sf.input_key for sf in sumfacts)
for inputkey in inputkeys:
sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
horizontal_vectorization_strategy(sumfact_filter)
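Conceptually, horizontal vectorization stacks up to four scalar sum factorization kernels whose matrices have identical sizes into one kernel whose innermost data dimension runs over the SIMD lanes, so every matrix application fills all lanes at once. The following NumPy snippet is only a rough illustration of that data layout, under the simplifying assumption that all lanes share one tabulation matrix (in the real code the lanes may differ, e.g. in their derivative flag, and explicit vector types are generated instead):

import numpy as np

width = 4                       # SIMD lanes = number of fused kernels
rows, cols = 3, 3               # all fused kernels apply matrices of this size
A = np.random.rand(rows, cols)  # shared tabulation matrix (simplification)
xs = [np.random.rand(cols) for _ in range(width)]  # four scalar inputs

# Scalar strategy: four independent matrix-vector products.
scalar = [A @ x for x in xs]

# Horizontal strategy: interleave the inputs so the last axis is the lane
# index, then a single batched product computes all four kernels at once.
X = np.stack(xs, axis=-1)       # shape (cols, width)
vectorized = A @ X              # shape (rows, width), one column per kernel

assert np.allclose(np.stack(scalar, axis=-1), vectorized)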