diff --git a/python/dune/perftool/sumfact/costmodel.py b/python/dune/perftool/sumfact/costmodel.py
deleted file mode 100644
index c036276703981cf136442ae1d48c51368eaa79bc..0000000000000000000000000000000000000000
--- a/python/dune/perftool/sumfact/costmodel.py
+++ /dev/null
@@ -1,25 +0,0 @@
-""" Implementation of the cost model used for vectorization
-"""
-
-
-def ilp_heuristic(sf):
-    """ A heuristic measure for the ILP capabilities of the kernel
-
-    Return is the throttle of insufficient ILP, generally in (0., 1.0].
-    1.0 indicates perfect instruction level parallelism.
-    """ 
-    return 1.0
-
-
-def vectorization_costfunction(sf):
-    # kernel characteristics necessary
-    traffic = sf.memory_traffic
-    intensity = sf.operations / sf.memory_traffic
-
-    # hardware characteristics necessary
-    bandwidth = 100000000000000000.0
-    peakperformance = 100.0
-
-    roofline = min(bandwidth * intensity, ilp_heuristics(sf) * peakperformance)
-
-    return (traffic * intensity) / roofline
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 0d7c4b038827a4af1e076f6982d6e50b496e6291..6261a38016b9635c06c6e51cd796461f154fe3a1 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -17,8 +17,10 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                               )
 from dune.perftool.error import PerftoolError
 from dune.perftool.options import get_option
-from dune.perftool.tools import round_to_multiple
+from dune.perftool.tools import add_to_frozendict, round_to_multiple
 
+from frozendict import frozendict
+import itertools as it
 import loopy as lp
 import numpy as np
 
@@ -264,3 +266,120 @@ def decide_vectorization_strategy():
     # Register the results
     for sf in all_sumfacts:
         _cache_vectorization_info(sf, sfdict.get(sf, no_vec(sf)))
+
+
+def vectorization_opportunity_generator(sumfacts, width):
+    """ Generator that yields all vectorization opportunities for the given
+    sum factorization kernels as tuples of quadrature point tuple and vectorization
+    dictionary """
+    #
+    # Find all the possible quadrature point tuples
+    #
+    quad_points = [quadrature_points_per_direction()]
+
+    if get_option("vectorize_allow_quadrature_changes"):
+        sf = next(iter(sumfacts))
+        depth = 1
+        while depth <= width:
+            i = 0 if sf.matrix_sequence[0].face is None else 1
+            quad = list(quadrature_points_per_direction())
+            quad[i] = round_to_multiple(quad[i], depth)
+            quad_points.append(tuple(quad))
+            depth = depth * 2
+        quad_points = list(set(quad_points))
+
+    for qp in quad_points:
+        #
+        # Determine vectorization opportunities given a fixed quadrature point number
+        #
+        for opp in fixed_quad_vectorization_opportunity_generator(frozenset(sumfacts), width, qp):
+            yield qp, opp
+
+
+def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
+    if len(sumfacts) == 0:
+        # We have gone into recursion deep enough to have all sum factorization nodes
+        # assigned their vectorized counterpart. We can yield the result now!
+        yield already
+        return
+
+    # Otherwise we pick a random sum factorization kernel and construct all the vectorization
+    # opportunities realizing this particular kernel and go into recursion.
+    sf_to_decide = next(iter(sumfacts))
+
+    # Have "unvectorized" as an option, although it is not good
+    for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
+                                                              width,
+                                                              qp,
+                                                              add_to_frozendict(already, {sf_to_decide: sf_to_decide}),
+                                                              ):
+        yield opp
+
+    # Find all the sum factorization kernels that the chosen kernel can be parallelized with
+    #
+    # TODO: Right now we check for same input, which is not actually needed in order
+    #       to be a suitable candidate! We should relax this concept at some point!
+    candidates = list(filter(lambda sf: sf.input_key == sf_to_decide.input_key, sumfacts))
+
+    horizontal = 1
+    while horizontal <= width:
+        # Iterate over the possible combinations of sum factorization kernels
+        # taking into account all the permutations of kernels. This also includes
+        # combinations which use a padding of 1.
+        for combo in it.chain(it.permutations(candidates, horizontal),
+                              it.permutations(candidates, horizontal - 1)):
+            # The chosen kernels must be part of the kernels for recursion
+            # to work correctly
+            if sf_to_decide not in combo:
+                continue
+
+            # Set up the vectorization dict for this combo
+            vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
+            if vecdict is None:
+                # This particular choice was rejected for some reason.
+                # Possible reasons:
+                # * the quadrature point tuple not being suitable
+                #   for this vectorization strategy
+                continue
+
+            # Go into recursion to also vectorize all kernels not in this combo
+            for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference(combo),
+                                                                      width,
+                                                                      qp,
+                                                                      add_to_frozendict(already, vecdict),
+                                                                      ):
+                yield opp
+
+        horizontal = horizontal * 2
+
+
+def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
+    # Enhance the list of sumfact nodes by adding vertical splittings
+    kernels = []
+    for sf in sumfacts:
+        # No slicing needed in the pure horizontal case
+        if vertical == 1:
+            kernels.append(sf)
+            continue
+
+        # Determine the slicing direction
+        slice_direction = 0 if sf.matrix_sequence[0].face is None else 1
+        if qp[slice_direction] % vertical != 0:
+            return None
+
+        # Split the basis tabulation matrices
+        oldtab = sf.matrix_sequence[slice_direction]
+        for i in range(vertical):
+            seq = list(sf.matrix_sequence)
+            seq[slice_direction] = oldtab.copy(slice_size=vertical,
+                                               slice_index=i)
+            kernels.append(sf.copy(matrix_sequence=tuple(seq)))
+
+
+    # Join the new kernels into a sum factorization node
+    buffer = get_counted_variable("joined_buffer")
+    return {sf: VectorizedSumfactKernel(kernels=tuple(kernels),
+                                        horizontal_width=horizontal,
+                                        vertical_width=vertical,
+                                        buffer=buffer,
+                                        ) for sf in sumfacts}
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index 260fc9dfe6151492405e8b6996afa1623e9f3896..d358f3ab45ef8231a3a952a881167e7108c14704 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -3,6 +3,7 @@ from __future__ import absolute_import
 
 import loopy as lp
 import pymbolic.primitives as prim
+import frozendict
 
 
 def get_pymbolic_basename(expr):
@@ -67,3 +68,9 @@ def ceildiv(a, b):
 
 def round_to_multiple(x, n):
     return n * ceildiv(x, n)
+
+
+def add_to_frozendict(fd, valdict):
+    t = dict(fd)
+    t.update(valdict)
+    return frozendict.frozendict(t)