From 743d6ec03250772020279779677a51ca67d0f324 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Thu, 21 Sep 2017 17:37:21 +0200
Subject: [PATCH] Add a first draft for a cost model

---
 python/dune/perftool/sumfact/costmodel.py   | 25 ++++++++++++
 python/dune/perftool/sumfact/permutation.py |  3 +-
 python/dune/perftool/sumfact/symbolic.py    | 45 +++++++++++++++++++++
 python/dune/perftool/sumfact/tabulation.py  | 26 ++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 python/dune/perftool/sumfact/costmodel.py

diff --git a/python/dune/perftool/sumfact/costmodel.py b/python/dune/perftool/sumfact/costmodel.py
new file mode 100644
index 00000000..c0362767
--- /dev/null
+++ b/python/dune/perftool/sumfact/costmodel.py
@@ -0,0 +1,25 @@
+""" Implementation of the cost model used for vectorization
+"""
+
+
+def ilp_heuristic(sf):
+    """ A heuristic measure for the ILP capabilities of the kernel
+
+    Return is the throttle of insufficient ILP, generally in (0., 1.0].
+    1.0 indicates perfect instruction level parallelism.
+    """
+    return 1.0
+
+
+def vectorization_costfunction(sf):
+    # kernel characteristics necessary
+    traffic = sf.memory_traffic
+    intensity = sf.operations / sf.memory_traffic
+
+    # hardware characteristics necessary
+    bandwidth = 100000000000000000.0
+    peakperformance = 100.0
+
+    roofline = min(bandwidth * intensity, ilp_heuristic(sf) * peakperformance)
+
+    return (traffic * intensity) / roofline
diff --git a/python/dune/perftool/sumfact/permutation.py b/python/dune/perftool/sumfact/permutation.py
index 7ebc0be7..b00be306 100644
--- a/python/dune/perftool/sumfact/permutation.py
+++ b/python/dune/perftool/sumfact/permutation.py
@@ -38,7 +38,8 @@ def flop_cost(matrix_sequence):
         for i in range(l, len(matrix_sequence)):
             cost_n *= matrix_sequence[i].cols
         cost += cost_m * cost_n
-    return cost
+    # The factor of 2 indicates FMA
+    return 2 * cost
 
 
 def sumfact_permutation_strategy(sf):
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 296d6f6d..44ae9f79 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -308,6 +308,29 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def vector_width(self):
         return 1
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes needed from RAM for the kernel
+        to be executed - neglecting the existence of caches of course
+        """
+        dofs = product(mat.basis_size for mat in self.matrix_sequence)
+        matrices = sum(mat.memory_traffic for mat in set(self.matrix_sequence))
+
+        # TODO: this is a hard coded sizeof(double)
+        return (dofs + matrices) * 8
+
+    @property
+    def operations(self):
+        """ The total number of floating point operations for the kernel
+        to be carried out """
+        from dune.perftool.sumfact.permutation import flop_cost
+        return flop_cost(self.matrix_sequence)
+
+
 # Extract the argument list and store it on the class. This needs to be done
 # outside of the class because the SumfactKernel class object needs to be fully
 # initialized in order to extract the information from __init__.
@@ -587,3 +610,25 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     @property
     def tag(self):
         return "vecsumfac_h{}_v{}".format(self.horizontal_width, self.vertical_width)
+
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes needed from RAM for the kernel
+        to be executed - neglecting the existence of caches of course
+        """
+        dofs = product(mat.basis_size for mat in self.matrix_sequence)
+        matrices = sum(mat.memory_traffic for mat in set(self.matrix_sequence))
+
+        # TODO: this is a hard coded sizeof(double)
+        return (dofs + matrices) * 8
+
+    @property
+    def operations(self):
+        """ The total number of floating point operations for the kernel
+        to be carried out """
+        from dune.perftool.sumfact.permutation import flop_cost
+        return flop_cost(self.matrix_sequence)
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index e8518668..3e7c6375 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -98,6 +98,17 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
     def vectorized(self):
         return False
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The number of matrix entries (rows * cols) to be loaded from RAM,
+        neglecting caches - the kernel scales this by sizeof(double)
+        """
+        return self.rows * self.cols
+
 
 class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def __init__(self, tabs, width=None):
@@ -200,6 +211,21 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def vectorized(self):
         return True
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The number of matrix entries to be loaded from RAM, neglecting
+        caches - counted once if all tabs are equal, else once per lane
+        """
+        if len(set(self.tabs)) == 1:
+            factor = 1
+        else:
+            factor = self.width
+        return factor * self.rows * self.cols
+
 
 def quadrature_points_per_direction():
     # Quadrature order per direction
-- 
GitLab