From a8eadc594f95f683dab3238d0a2b536f3179ab8d Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Wed, 5 Apr 2017 14:40:24 +0200
Subject: [PATCH] Use padding on theta large matrices and get trid of explicit
 4s

---
 python/dune/perftool/sumfact/accumulation.py  |  2 +-
 python/dune/perftool/sumfact/realization.py   |  2 +-
 python/dune/perftool/sumfact/symbolic.py      |  9 +++++----
 python/dune/perftool/sumfact/tabulation.py    | 20 ++++++++++++++-----
 python/dune/perftool/sumfact/vectorization.py | 14 ++++++++++---
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index d77f8145..d09a91b7 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -203,7 +203,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         vecinames = ()
         # TODO: evaluate whether the following line would be okay with vsf.vectorized
         if vsf.vec_index(sf) is not None:
-            iname = accum_iname((accterm.argument.restriction, restriction), 4, "vec")
+            iname = accum_iname((accterm.argument.restriction, restriction), vsf.vector_width, "vec")
             vecinames = (iname,)
             transform(lp.tag_inames, [(iname, "vec")])
             from dune.perftool.tools import maybe_wrap_subscript
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index e2e375a2..583f8f4d 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -95,7 +95,7 @@ def _realize_sum_factorization_kernel(sf):
     if sf.vectorized:
         ftags = ftags + ",vec"
         ctags = ctags + ",vec"
-        vec_shape = (4,)
+        vec_shape = (sf.vector_width,)
 
     # Measure times and count operations in c++ code
     if get_option("instrumentation_level") >= 4:
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index a9a0dd4c..37fa5e3f 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -344,6 +344,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
                                                 transpose=self.kernels[0].matrix_sequence[i].transpose,
                                                 face=self.kernels[0].matrix_sequence[i].face,
                                                 derivative=tuple(k.matrix_sequence[i].derivative for k in self.kernels),
+                                                width=self.vector_width,
                                                 )
                      for i in range(self.length))
 
@@ -419,14 +420,14 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
 
     @property
     def flat_input_shape(self):
-        return (product(mat.cols for mat in self.matrix_sequence), 4)
+        return (product(mat.cols for mat in self.matrix_sequence), self.vector_width)
 
     @property
     def quadrature_shape(self):
         if self.transposed:
-            return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + (4,)
+            return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,)
         else:
-            return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) + (4,)
+            return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,)
 
     @property
     def quadrature_dimtags(self):
@@ -436,7 +437,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
 
     @property
     def dof_shape(self):
-        return tuple(mat.rows for mat in self.matrix_sequence) + (4,)
+        return tuple(mat.rows for mat in self.matrix_sequence) + (self.vector_width,)
 
     @property
     def dof_dimtags(self):
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index c6fa99e3..e6465985 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -59,7 +59,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase):
 
 
 class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
-    def __init__(self, rows, cols, transpose, derivative, face):
+    def __init__(self, rows, cols, transpose, derivative, face, width):
         assert isinstance(derivative, tuple)
         BasisTabulationMatrixBase.__init__(self,
                                            rows=rows,
@@ -67,6 +67,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
                                            transpose=transpose,
                                            derivative=derivative,
                                            face=face,
+                                           width=width,
                                            )
 
     @property
@@ -77,13 +78,22 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
                                                quadrature_points_per_direction(),
                                                )
         for i, d in enumerate(self.derivative):
-            define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,))
+            define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,), width=self.width)
+
+        # Apply padding to those fields not used. This is necessary because you may get memory
+        # initialized with NaN and those NaNs will screw the horizontal_add.
+        i = theta_iname("i", self.rows)
+        j = theta_iname("j", self.cols)
+        for index in set(range(self.width)) - set(range(len(self.derivative))):
+            instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j), index)),
+                        expression=0,
+                        kernel="operator")
 
         return loopy_class_member(name,
                                   classtag="operator",
                                   dtype=numpy.float64,
                                   dim_tags="f,f,vec",
-                                  shape=(self.rows, self.cols, 4),
+                                  shape=(self.rows, self.cols, self.width),
                                   potentially_vectorized=True,
                                   managed=True,
                                   )
@@ -214,7 +224,7 @@ def polynomial_lookup_mangler(target, func, dtypes):
         return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))
 
 
-def define_theta(name, shape, transpose, derivative, face=None, additional_indices=()):
+def define_theta(name, shape, transpose, derivative, face=None, additional_indices=(), width=None):
     qp = name_oned_quadrature_points()
     qw = name_oned_quadrature_weights()
     sort_quadrature_points_weights(qp, qw)
@@ -223,7 +233,7 @@ def define_theta(name, shape, transpose, derivative, face=None, additional_indic
     dim_tags = "f,f"
     if additional_indices:
         dim_tags = dim_tags + ",c"
-        shape = shape + (4,)
+        shape = shape + (width,)
 
     loopy_class_member(name,
                        dtype=numpy.float64,
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 84c7e615..460ae1e9 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -1,5 +1,6 @@
 """ Sum factorization vectorization """
 
+from dune.perftool.loopy.vcl import get_vcl_type_size
 from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
 from dune.perftool.generation import (generator_factory,
                                       get_counted_variable,
@@ -13,6 +14,7 @@ from dune.perftool.error import PerftoolError
 from dune.perftool.options import get_option
 
 import loopy as lp
+import numpy as np
 
 
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
@@ -40,10 +42,16 @@ def no_vectorization(sumfacts):
 
 
 def horizontal_vectorization_strategy(sumfacts):
-    if len(sumfacts) in (3, 4):
+    width = get_vcl_type_size(np.float64)
+    # We currently heuristically allow horizontal vectorization if the number
+    # of kernels matches the maximum vector width or is one below that. There
+    # is a lot of TODOs with that procedure:
+    # * You might have *more* kernels than vector width
+    # * You might want to combine horizontal and vertical strategies
+    if len(sumfacts) in (width - 1, width):
         # Map the sum factorization to their position in the joint kernel
         position_mapping = {}
-        available = set(range(4))
+        available = set(range(width))
         for sf in sumfacts:
             if sf.preferred_position is not None:
                 # This asserts that no two kernels want to take the same position
@@ -69,7 +77,7 @@ def horizontal_vectorization_strategy(sumfacts):
         for sumf in sumfacts:
             _cache_vectorization_info(sumf,
                                       VectorizedSumfactKernel(kernels=kernels,
-                                                              vector_width=4,
+                                                              vector_width=width,
                                                               buffer=buffer,
                                                               input=input,
                                                               )
-- 
GitLab