diff --git a/python/dune/perftool/sumfact/costmodel.py b/python/dune/perftool/sumfact/costmodel.py
new file mode 100644
index 0000000000000000000000000000000000000000..c036276703981cf136442ae1d48c51368eaa79bc
--- /dev/null
+++ b/python/dune/perftool/sumfact/costmodel.py
@@ -0,0 +1,25 @@
+""" Implementation of the cost model used for vectorization
+"""
+
+
+def ilp_heuristic(sf):
+    """ A heuristic measure for the ILP capabilities of the kernel
+
+    Return is the throttle of insufficient ILP, generally in (0., 1.0].
+    1.0 indicates perfect instruction level parallelism.
+    """ 
+    return 1.0
+
+
+def vectorization_costfunction(sf):
+    # kernel characteristics necessary
+    traffic = sf.memory_traffic
+    intensity = sf.operations / sf.memory_traffic
+
+    # hardware characteristics necessary
+    bandwidth = 100000000000000000.0
+    peakperformance = 100.0
+
+    roofline = min(bandwidth * intensity, ilp_heuristics(sf) * peakperformance)
+
+    return (traffic * intensity) / roofline
diff --git a/python/dune/perftool/sumfact/permutation.py b/python/dune/perftool/sumfact/permutation.py
index 7ebc0be78adf2a00d184fcd8187de38a53b2d648..b00be306a7f6c724b35107c30692c2f0d1b4f14a 100644
--- a/python/dune/perftool/sumfact/permutation.py
+++ b/python/dune/perftool/sumfact/permutation.py
@@ -38,7 +38,8 @@ def flop_cost(matrix_sequence):
         for i in range(l, len(matrix_sequence)):
             cost_n *= matrix_sequence[i].cols
         cost += cost_m * cost_n
-    return cost
+    # The factor of 2 indicates FMA
+    return 2 * cost
 
 
 def sumfact_permutation_strategy(sf):
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 296d6f6d757cd6a9cccbd95d8e0d37e856624e8a..44ae9f7946e8a71dd90ab34b6b941f578cb9b76e 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -308,6 +308,29 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     def vector_width(self):
         return 1
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes needed from RAM for the kernel
+        to be executed - neglecting the existence of caches of course
+        """
+        input = product(mat.basis_size for mat in self.matrix_sequence)
+        matrices = sum(mat.memory_traffic for mat in set(matrix_sequence))
+
+        # TODO: this is a hard coded sizeof(double)
+        return (input + matrices) * 8
+
+    @property
+    def operations(self):
+        """ The total number of floating point operations for the kernel
+        to be carried out """
+        from dune.perftool.sumfact.tabulation import flop_cost
+        return flop_cost(self.matrix_sequence)
+
+
 # Extract the argument list and store it on the class. This needs to be done
 # outside of the class because the SumfactKernel class object needs to be fully
 # initialized in order to extract the information from __init__.
@@ -587,3 +610,25 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
     @property
     def tag(self):
         return "vecsumfac_h{}_v{}".format(self.horizontal_width, self.vertical_width)
+
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes needed from RAM for the kernel
+        to be executed - neglecting the existence of caches of course
+        """
+        dofs = product(mat.basis_size for mat in self.matrix_sequence)
+        matrices = sum(mat.memory_traffic for mat in set(matrix_sequence))
+
+        # TODO: this is a hard coded sizeof(double)
+        return (dofs + matrices) * 8
+
+    @property
+    def operations(self):
+        """ The total number of floating point operations for the kernel
+        to be carried out """
+        from dune.perftool.sumfact.tabulation import flop_cost
+        return flop_cost(self.matrix_sequence)
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index e8518668c227c7528ab72712bbd9e2cba92c147f..3e7c63752b9a770ff4cd9d2e7ecc945edc0c49c5 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -98,6 +98,17 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord):
     def vectorized(self):
         return False
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes needed from RAM for the kernel
+        to be executed - neglecting the existence of caches of course
+        """
+        return mat.rows * mat.cols
+
 
 class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def __init__(self, tabs, width=None):
@@ -200,6 +211,21 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def vectorized(self):
         return True
 
+    #
+    # Implement properties needed by cost models
+    #
+
+    @property
+    def memory_traffic(self):
+        """ The total number of bytes needed from RAM for the kernel
+        to be executed - neglecting the existence of caches of course
+        """
+        if len(set(self.tabs)) == 1:
+            factor = 1
+        else:
+            factor = self.width
+        return factor * mat.rows * mat.cols
+
 
 def quadrature_points_per_direction():
     # Quadrature order per direction