diff --git a/python/dune/perftool/sumfact/costmodel.py b/python/dune/perftool/sumfact/costmodel.py new file mode 100644 index 0000000000000000000000000000000000000000..c036276703981cf136442ae1d48c51368eaa79bc --- /dev/null +++ b/python/dune/perftool/sumfact/costmodel.py @@ -0,0 +1,25 @@ +""" Implementation of the cost model used for vectorization +""" + + +def ilp_heuristic(sf): + """ A heuristic measure for the ILP capabilities of the kernel + + Return is the throttle of insufficient ILP, generally in (0., 1.0]. + 1.0 indicates perfect instruction level parallelism. + """ + return 1.0 + + +def vectorization_costfunction(sf): + # kernel characteristics necessary + traffic = sf.memory_traffic + intensity = sf.operations / sf.memory_traffic + + # hardware characteristics necessary + bandwidth = 100000000000000000.0 + peakperformance = 100.0 + + roofline = min(bandwidth * intensity, ilp_heuristics(sf) * peakperformance) + + return (traffic * intensity) / roofline diff --git a/python/dune/perftool/sumfact/permutation.py b/python/dune/perftool/sumfact/permutation.py index 7ebc0be78adf2a00d184fcd8187de38a53b2d648..b00be306a7f6c724b35107c30692c2f0d1b4f14a 100644 --- a/python/dune/perftool/sumfact/permutation.py +++ b/python/dune/perftool/sumfact/permutation.py @@ -38,7 +38,8 @@ def flop_cost(matrix_sequence): for i in range(l, len(matrix_sequence)): cost_n *= matrix_sequence[i].cols cost += cost_m * cost_n - return cost + # The factor of 2 indicates FMA + return 2 * cost def sumfact_permutation_strategy(sf): diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index 296d6f6d757cd6a9cccbd95d8e0d37e856624e8a..44ae9f7946e8a71dd90ab34b6b941f578cb9b76e 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -308,6 +308,29 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): def vector_width(self): return 1 + # + # Implement properties needed by cost models + # + + @property + def memory_traffic(self): + """ The total number of bytes needed from RAM for the kernel + to be executed - neglecting the existence of caches of course + """ + input = product(mat.basis_size for mat in self.matrix_sequence) + matrices = sum(mat.memory_traffic for mat in set(matrix_sequence)) + + # TODO: this is a hard coded sizeof(double) + return (input + matrices) * 8 + + @property + def operations(self): + """ The total number of floating point operations for the kernel + to be carried out """ + from dune.perftool.sumfact.tabulation import flop_cost + return flop_cost(self.matrix_sequence) + + # Extract the argument list and store it on the class. This needs to be done # outside of the class because the SumfactKernel class object needs to be fully # initialized in order to extract the information from __init__. @@ -587,3 +610,25 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def tag(self): return "vecsumfac_h{}_v{}".format(self.horizontal_width, self.vertical_width) + + # + # Implement properties needed by cost models + # + + @property + def memory_traffic(self): + """ The total number of bytes needed from RAM for the kernel + to be executed - neglecting the existence of caches of course + """ + dofs = product(mat.basis_size for mat in self.matrix_sequence) + matrices = sum(mat.memory_traffic for mat in set(matrix_sequence)) + + # TODO: this is a hard coded sizeof(double) + return (dofs + matrices) * 8 + + @property + def operations(self): + """ The total number of floating point operations for the kernel + to be carried out """ + from dune.perftool.sumfact.tabulation import flop_cost + return flop_cost(self.matrix_sequence) diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index e8518668c227c7528ab72712bbd9e2cba92c147f..3e7c63752b9a770ff4cd9d2e7ecc945edc0c49c5 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -98,6 +98,17 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): def vectorized(self): return False + # + # Implement properties needed by cost models + # + + @property + def memory_traffic(self): + """ The total number of bytes needed from RAM for the kernel + to be executed - neglecting the existence of caches of course + """ + return mat.rows * mat.cols + class BasisTabulationMatrixArray(BasisTabulationMatrixBase): def __init__(self, tabs, width=None): @@ -200,6 +211,21 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): def vectorized(self): return True + # + # Implement properties needed by cost models + # + + @property + def memory_traffic(self): + """ The total number of bytes needed from RAM for the kernel + to be executed - neglecting the existence of caches of course + """ + if len(set(self.tabs)) == 1: + factor = 1 + else: + factor = self.width + return factor * mat.rows * mat.cols + def quadrature_points_per_direction(): # Quadrature order per direction