From 6cb3a57e670cd445c5b4db793b2855e68a599ab6 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Thu, 6 Apr 2017 13:54:38 +0200
Subject: [PATCH] First implementation of vertical vectorization

---
 python/dune/perftool/sumfact/tabulation.py    |  2 +-
 python/dune/perftool/sumfact/vectorization.py | 39 +++++++++++++++++--
 test/sumfact/mass/CMakeLists.txt              |  5 +++
 test/sumfact/mass/mass_3d.mini                |  3 ++
 test/sumfact/mass/mass_3d.ufl                 |  2 +-
 5 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index d6519699..71cadb75 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -56,7 +56,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
         if quadrature_size is None:
             quadrature_size = quadrature_points_per_direction()
         if slice_size is not None:
-            quadrature_size = ceildiv(quadrature_size, slize_size)
+            quadrature_size = ceildiv(quadrature_size, slice_size)
         if basis_size is None:
             basis_size = basis_functions_per_direction()
         ImmutableRecord.__init__(self,
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 460ae1e9..4e709b39 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -41,6 +41,35 @@ def no_vectorization(sumfacts):
                                               input=get_counted_variable("input")))
 
 
+def vertical_vectorization_strategy(sumfact, depth):
+    # For the sake of simplicity, we restrict ourselves to stage 1 so far
+    if sumfact.stage == 1:
+        # Assert that none of the matrices is already sliced
+        assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
+
+        # Slice the last matrix whose quadrature size is divisible by depth
+        sliced = None
+        for i, mat in enumerate(sumfact.matrix_sequence):
+            if mat.quadrature_size % depth == 0:
+                sliced = i
+
+        # Currently we assume that a matrix suitable for slicing is always found!
+        assert sliced is not None
+
+        kernels = []
+        oldtab = sumfact.matrix_sequence[sliced]
+        for i in range(depth):
+            seq = list(sumfact.matrix_sequence)
+            seq[sliced] = oldtab.copy(slice_size=depth,
+                                      slice_index=i)
+            kernels.append(sumfact.copy(matrix_sequence=tuple(seq)))
+
+        vsf = VectorizedSumfactKernel(kernels=tuple(kernels))
+        return _cache_vectorization_info(sumfact, vsf)
+    else:
+        return _cache_vectorization_info(sumfact, sumfact)
+
+
 def horizontal_vectorization_strategy(sumfacts):
     width = get_vcl_type_size(np.float64)
     # We currently heuristically allow horizontal vectorization if the number
@@ -95,12 +124,16 @@ def decide_vectorization_strategy():
     from dune.perftool.generation import retrieve_cache_items
     sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
 
-    if not get_option("vectorize_grads"):
-        no_vectorization(sumfacts)
-    else:
+    if get_option("vectorize_grads"):
         # Currently we base our idea here on the fact that we only group sum
         # factorization kernels with the same input.
         inputkeys = set(sf.input_key for sf in sumfacts)
         for inputkey in inputkeys:
             sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
             horizontal_vectorization_strategy(sumfact_filter)
+    elif get_option("vectorize_slice"):
+        for sumfact in sumfacts:
+            width = get_vcl_type_size(np.float64)
+            vertical_vectorization_strategy(sumfact, width)
+    else:
+        no_vectorization(sumfacts)
diff --git a/test/sumfact/mass/CMakeLists.txt b/test/sumfact/mass/CMakeLists.txt
index a640f020..a1313988 100644
--- a/test/sumfact/mass/CMakeLists.txt
+++ b/test/sumfact/mass/CMakeLists.txt
@@ -8,3 +8,8 @@ dune_add_formcompiler_system_test(UFLFILE mass_3d.ufl
                                   BASENAME sumfact_mass_3d
                                   INIFILE mass_3d.mini
                                   )
+
+dune_add_formcompiler_system_test(UFLFILE mass_3d.ufl
+                                  BASENAME sumfact_mass_sliced
+                                  INIFILE sliced.mini
+                                  )
diff --git a/test/sumfact/mass/mass_3d.mini b/test/sumfact/mass/mass_3d.mini
index 2948674e..1244acec 100644
--- a/test/sumfact/mass/mass_3d.mini
+++ b/test/sumfact/mass/mass_3d.mini
@@ -17,3 +17,6 @@ extension = vtu
 numerical_jacobian = 1, 0 | expand num
 vectorize_quad = 1, 0 | expand vec
 sumfact = 1
+
+[formcompiler.ufl_variants]
+degree = 1
diff --git a/test/sumfact/mass/mass_3d.ufl b/test/sumfact/mass/mass_3d.ufl
index bd56d296..5f55103e 100644
--- a/test/sumfact/mass/mass_3d.ufl
+++ b/test/sumfact/mass/mass_3d.ufl
@@ -1,6 +1,6 @@
 cell = "hexahedron"
 
-V = FiniteElement("DG", cell, 1)
+V = FiniteElement("DG", cell, degree)
 
 u = TrialFunction(V)
 v = TestFunction(V)
-- 
GitLab