From 743d6ec03250772020279779677a51ca67d0f324 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Thu, 21 Sep 2017 17:37:21 +0200
Subject: [PATCH] Add a first draft for a cost model

---
 python/dune/perftool/sumfact/costmodel.py   | 25 ++++++++++++
 python/dune/perftool/sumfact/permutation.py |  3 +-
 python/dune/perftool/sumfact/symbolic.py    | 45 +++++++++++++++++++++
 python/dune/perftool/sumfact/tabulation.py  | 26 ++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 python/dune/perftool/sumfact/costmodel.py

diff --git a/python/dune/perftool/sumfact/costmodel.py b/python/dune/perftool/sumfact/costmodel.py
new file mode 100644
index 00000000..c0362767
--- /dev/null
+++ b/python/dune/perftool/sumfact/costmodel.py
@@ -0,0 +1,25 @@
+""" Implementation of the cost model used for vectorization
+"""
+
+
+def ilp_heuristic(sf):
+    """ Heuristically rate the instruction level parallelism (ILP) of kernel sf
+
+    The result is a throttling factor, generally in (0., 1.0], where 1.0 means
+    perfect ILP. This draft does not inspect sf and always returns 1.0.
+    """
+    return 1.0
+
+
+def vectorization_costfunction(sf):
+    """ Roofline estimate of the cost of sf: operations / attainable performance """
+    # kernel characteristics necessary
+    traffic = sf.memory_traffic
+    intensity = sf.operations / traffic
+
+    # hardware characteristics necessary (hard coded for now)
+    bandwidth = 100000000000000000.0
+    peakperformance = 100.0
+
+    roofline = min(bandwidth * intensity, ilp_heuristic(sf) * peakperformance)
+    return (traffic * intensity) / roofline
diff --git a/python/dune/perftool/sumfact/permutation.py b/python/dune/perftool/sumfact/permutation.py
index 7ebc0be7..b00be306 100644
--- a/python/dune/perftool/sumfact/permutation.py
+++ b/python/dune/perftool/sumfact/permutation.py
@@ -38,7 +38,8 @@ def flop_cost(matrix_sequence):
         for i in range(l, len(matrix_sequence)):
             cost_n *= matrix_sequence[i].cols
         cost += cost_m * cost_n
-    return cost
+    # The factor of 2 accounts for FMA: each multiply-add counts as two flops
+    return 2 * cost
 
 
 def sumfact_permutation_strategy(sf):
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 296d6f6d..44ae9f79 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -308,6 +308,29 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def vector_width(self):
         return 1
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes that need to be loaded from RAM for the
+        kernel to be executed - neglecting the existence of caches of course
+        """
+        dofs = product(mat.basis_size for mat in self.matrix_sequence)
+        # Every distinct matrix is only counted once, hence the set
+        matrices = sum(mat.memory_traffic for mat in set(self.matrix_sequence))
+        # TODO: this is a hard coded sizeof(double)
+        return (dofs + matrices) * 8
+
+    @property
+    def operations(self):
+        """ The total number of floating point operations for the kernel
+        to be carried out, as computed by flop_cost on the matrix sequence """
+        from dune.perftool.sumfact.permutation import flop_cost
+        return flop_cost(self.matrix_sequence)
+
+
 # Extract the argument list and store it on the class. This needs to be done
 # outside of the class because the SumfactKernel class object needs to be fully
 # initialized in order to extract the information from __init__.
@@ -587,3 +610,25 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     @property
     def tag(self):
         return "vecsumfac_h{}_v{}".format(self.horizontal_width, self.vertical_width)
+
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes that need to be loaded from RAM for the
+        kernel to be executed - neglecting the existence of caches of course
+        """
+        dofs = product(mat.basis_size for mat in self.matrix_sequence)
+        # Every distinct matrix is only counted once, hence the set
+        matrices = sum(mat.memory_traffic for mat in set(self.matrix_sequence))
+        # TODO: this is a hard coded sizeof(double)
+        return (dofs + matrices) * 8
+
+    @property
+    def operations(self):
+        """ The total number of floating point operations for the kernel
+        to be carried out, as computed by flop_cost on the matrix sequence """
+        from dune.perftool.sumfact.permutation import flop_cost
+        return flop_cost(self.matrix_sequence)
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index e8518668..3e7c6375 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -98,6 +98,17 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
     def vectorized(self):
         return False
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The number of matrix entries that need to be loaded from RAM for
+        this matrix - neglecting the existence of caches of course. This is
+        a count of entries (rows * cols), not of bytes. """
+        return self.rows * self.cols
+
 
 class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def __init__(self, tabs, width=None):
@@ -200,6 +211,21 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def vectorized(self):
         return True
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The number of matrix entries (not bytes) that need to be loaded
+        from RAM for this matrix array - neglecting the existence of caches
+        """
+        if len(set(self.tabs)) == 1:
+            factor = 1
+        else:
+            factor = self.width
+        return factor * self.rows * self.cols
+
 
 def quadrature_points_per_direction():
     # Quadrature order per direction
-- 
GitLab