From 0e8b4ea559058fb921e435d9254c3f4fce16decd Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Fri, 22 Sep 2017 16:24:34 +0200
Subject: [PATCH] Implement a greedy vectorization strategy

It always aims for maximally horizontal vectorization
where possible.
---
 .../poisson_dg_tensor/poisson_dg_tensor.mini  |  4 +-
 applications/stokes_dg/stokes_dg.mini         |  6 +--
 python/dune/perftool/options.py               |  1 +
 python/dune/perftool/sumfact/vectorization.py | 41 +++++++++++++++++++
 4 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
index feffea5b..09c57bf1 100644
--- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini
+++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
@@ -38,7 +38,7 @@ extension = vtu
 fastdg = 1
 sumfact = 1
 vectorize_quad = 1
-vectorize_grads = 1
+vectorize_greedy = 1
 instrumentation_level = 2, 3, 4 | expand
 opcounter = 1, 0 | expand opcount
 time_opcounter = 0, 1 | expand opcount
@@ -48,4 +48,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
 
 [formcompiler.ufl_variants]
 cell = hexahedron
-degree = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand
+degree = 1, 3, 5, 7, 9 | expand
diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini
index 246deb18..1b0fbdeb 100644
--- a/applications/stokes_dg/stokes_dg.mini
+++ b/applications/stokes_dg/stokes_dg.mini
@@ -39,7 +39,7 @@ extension = vtu
 fastdg = 1
 sumfact = 1
 vectorize_quad = 1
-vectorize_grads = 1
+vectorize_greedy = 1
 instrumentation_level = 2, 3, 4 | expand
 opcounter = 1, 0 | expand opcount
 time_opcounter = 0, 1 | expand opcount
@@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval
 
 [formcompiler.ufl_variants]
 cell = hexahedron
-v_degree = 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree
-p_degree = 1, 2, 3, 4, 5, 6, 7, 8, 9  | expand degree
+v_degree = 1, 3, 5, 7, 9 | expand degree
+p_degree = 0, 2, 4, 6, 8 | expand degree
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index ed378971..0e1e8f58 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord):
     vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)")
     turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
     architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
 
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index b2dca33d..ab5535d7 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -51,6 +51,13 @@ def no_vectorization(sumfacts):
 
 
 def vertical_vectorization_strategy(sumfact, depth):
+    # If depth is 1, there is nothing do
+    if depth == 1:
+        if isinstance(sumfact, SumfactKernel):
+            return {sumfact: sumfact}
+        else:
+            return {k: sumfact for k in sumfact.kernels}
+
     # Assert that this is not already sliced
     assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
     result = {}
@@ -139,6 +146,33 @@ def diagonal_vectorization_strategy(sumfacts, width):
     return result
 
 
+def greedy_vectorization_strategy(sumfacts, width):
+    sumfacts = set(sumfacts)
+    horizontal = width
+    vertical = 1
+
+    result = {}
+
+    while horizontal > 0:
+        if horizontal > 1:
+            horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=0)
+        else:
+            horizontal_kernels = {sf: sf for sf in sumfacts}
+
+        for sf in horizontal_kernels:
+            if horizontal_kernels[sf].horizontal_width == horizontal:
+                vert = vertical_vectorization_strategy(horizontal_kernels[sf],
+                                                       vertical)
+                for k in vert:
+                    result[k] = vert[k]
+                sumfacts.discard(sf)
+
+        horizontal = horizontal / 2
+        vertical = vertical * 2
+
+    return result
+
+
 def decide_vectorization_strategy():
     """ Decide how to vectorize!
     Note that the vectorization of the quadrature loop is independent of this,
@@ -188,6 +222,13 @@ def decide_vectorization_strategy():
             sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
             for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
                 sfdict[old] = new
+    elif get_option("vectorize_greedy"):
+        inputkeys = set(sf.input_key for sf in active_sumfacts)
+        for inputkey in inputkeys:
+            width = get_vcl_type_size(np.float64)
+            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
+            for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
+                sfdict[old] = new
     else:
         for old, new in no_vectorization(active_sumfacts).items():
             sfdict[old] = new
-- 
GitLab