Skip to content
Snippets Groups Projects
Commit 743d6ec0 authored by Dominic Kempf
Browse files

Add a first draft for a cost model

parent d30e55f5
No related branches found
No related tags found
No related merge requests found
""" Implementation of the cost model used for vectorization
"""
def ilp_heuristic(sf):
    """Estimate how well the kernel exploits instruction level parallelism.

    The result is the throttle caused by insufficient ILP, generally a
    value in (0., 1.0], where 1.0 stands for perfect instruction level
    parallelism.

    In this first draft the argument ``sf`` is not inspected and the ideal
    value is always reported.
    """
    perfect_ilp = 1.0
    return perfect_ilp
def vectorization_costfunction(sf, bandwidth=100000000000000000.0, peakperformance=100.0):
    """Estimate the cost of a kernel with a roofline-style model.

    The kernel ``sf`` must provide the properties ``memory_traffic`` and
    ``operations``. The attainable performance is the minimum of the
    memory bound (``bandwidth`` times the arithmetic intensity) and the
    compute bound (``peakperformance`` throttled by ``ilp_heuristic``).
    The returned cost is the operation count divided by that attainable
    performance, so smaller values are better.

    :arg sf: The kernel object to rate.
    :arg bandwidth: Memory bandwidth used for the memory bound. The default
        is the placeholder value of the initial draft.
        NOTE(review): units are not fixed by this code - confirm.
    :arg peakperformance: Peak performance used for the compute bound. The
        default is the placeholder value of the initial draft.
    :returns: The estimated cost as a float.
    """
    # kernel characteristics necessary
    traffic = sf.memory_traffic
    intensity = sf.operations / traffic

    # The original draft called the non-existent name `ilp_heuristics`,
    # which raised a NameError on every invocation.
    roofline = min(bandwidth * intensity, ilp_heuristic(sf) * peakperformance)

    # traffic * intensity recovers the operation count of the kernel
    return (traffic * intensity) / roofline
...@@ -38,7 +38,8 @@ def flop_cost(matrix_sequence): ...@@ -38,7 +38,8 @@ def flop_cost(matrix_sequence):
for i in range(l, len(matrix_sequence)): for i in range(l, len(matrix_sequence)):
cost_n *= matrix_sequence[i].cols cost_n *= matrix_sequence[i].cols
cost += cost_m * cost_n cost += cost_m * cost_n
return cost # The factor of 2 indicates FMA
return 2 * cost
def sumfact_permutation_strategy(sf): def sumfact_permutation_strategy(sf):
......
...@@ -308,6 +308,29 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): ...@@ -308,6 +308,29 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
def vector_width(self): def vector_width(self):
return 1 return 1
#
# Implement properties needed by cost models
#
@property
def memory_traffic(self):
    """ The total number of bytes needed from RAM for the kernel
    to be executed - neglecting the existence of caches of course

    The estimate is the sum of the input size (the product of the
    ``basis_size`` of all matrices in the matrix sequence) and the
    ``memory_traffic`` of every distinct matrix in the sequence,
    multiplied by 8.
    """
    # NOTE(review): `product` is not defined in this block; presumably it
    # is imported at module level - confirm.
    dofs = product(mat.basis_size for mat in self.matrix_sequence)

    # Going through a set counts a matrix that occurs several times in the
    # sequence only once. The original referenced the bare name
    # `matrix_sequence` here, which raised a NameError.
    matrices = sum(mat.memory_traffic for mat in set(self.matrix_sequence))

    # TODO: this is a hard coded sizeof(double)
    return (dofs + matrices) * 8
@property
def operations(self):
    """ The number of floating point operations this kernel carries out.

    The value is obtained by passing the kernel's matrix sequence to
    ``flop_cost`` from the tabulation module.
    """
    # Function-scope import, kept exactly where the original had it
    from dune.perftool.sumfact.tabulation import flop_cost

    matrices = self.matrix_sequence
    return flop_cost(matrices)
# Extract the argument list and store it on the class. This needs to be done # Extract the argument list and store it on the class. This needs to be done
# outside of the class because the SumfactKernel class object needs to be fully # outside of the class because the SumfactKernel class object needs to be fully
# initialized in order to extract the information from __init__. # initialized in order to extract the information from __init__.
...@@ -587,3 +610,25 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) ...@@ -587,3 +610,25 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
@property
def tag(self):
    """ A string tag built from the horizontal and vertical width of
    this kernel, of the form ``vecsumfac_h<horizontal>_v<vertical>``.
    """
    widths = (self.horizontal_width, self.vertical_width)
    return "vecsumfac_h{}_v{}".format(*widths)
#
# Implement properties needed by cost models
#
@property
def memory_traffic(self):
    """ The total number of bytes needed from RAM for the kernel
    to be executed - neglecting the existence of caches of course

    The estimate is the sum of the number of degrees of freedom (the
    product of the ``basis_size`` of all matrices in the matrix sequence)
    and the ``memory_traffic`` of every distinct matrix in the sequence,
    multiplied by 8.
    """
    # NOTE(review): `product` is not defined in this block; presumably it
    # is imported at module level - confirm.
    dofs = product(mat.basis_size for mat in self.matrix_sequence)

    # Going through a set counts a matrix that occurs several times in the
    # sequence only once. The original referenced the bare name
    # `matrix_sequence` here, which raised a NameError.
    matrices = sum(mat.memory_traffic for mat in set(self.matrix_sequence))

    # TODO: this is a hard coded sizeof(double)
    return (dofs + matrices) * 8
@property
def operations(self):
    """ The number of floating point operations this kernel carries out.

    The value is obtained by passing the kernel's matrix sequence to
    ``flop_cost`` from the tabulation module.
    """
    # Function-scope import, kept exactly where the original had it
    from dune.perftool.sumfact.tabulation import flop_cost

    matrices = self.matrix_sequence
    return flop_cost(matrices)
...@@ -98,6 +98,17 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): ...@@ -98,6 +98,17 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
def vectorized(self): def vectorized(self):
return False return False
#
# Implement properties needed by cost models
#
@property
def memory_traffic(self):
    """ The number of matrix entries that need to be loaded from RAM
    for this tabulation matrix - neglecting the existence of caches
    of course.

    The result is ``rows * cols``, i.e. a count of entries and not a
    number of bytes.
    NOTE(review): callers are presumably expected to apply the size of
    the scalar type themselves - confirm.
    """
    # The original referenced an undefined name `mat` instead of `self`,
    # which raised a NameError on access.
    return self.rows * self.cols
class BasisTabulationMatrixArray(BasisTabulationMatrixBase): class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
def __init__(self, tabs, width=None): def __init__(self, tabs, width=None):
...@@ -200,6 +211,21 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): ...@@ -200,6 +211,21 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
def vectorized(self): def vectorized(self):
return True return True
#
# Implement properties needed by cost models
#
@property
def memory_traffic(self):
    """ The number of matrix entries that need to be loaded from RAM
    for this array of tabulation matrices - neglecting the existence
    of caches of course.

    If all entries of ``self.tabs`` are equal, a single matrix worth of
    entries (``rows * cols``) is counted. Otherwise that count is
    multiplied by ``self.width``.
    NOTE(review): presumably ``width`` is the number of matrices packed
    into the array - confirm.
    """
    # Identical tabulation matrices only need to be loaded once
    factor = 1 if len(set(self.tabs)) == 1 else self.width

    # The original referenced an undefined name `mat` instead of `self`,
    # which raised a NameError on access.
    return factor * self.rows * self.cols
def quadrature_points_per_direction(): def quadrature_points_per_direction():
# Quadrature order per direction # Quadrature order per direction
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment