diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 0c626519dd5ab7c4631d01749440bcb2efa3ca64..14af6638c9a29669c9a28556d0d689bc75a3d49e 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -53,7 +53,7 @@ def attach_vectorization_info(sf):
 
 
 def position_penalty_factor(sf):
-    if isinstance(sf, SumfactKernel):
+    if isinstance(sf, SumfactKernel) or sf.vertical_width > 1:
         return 1
     else:
         return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length))
@@ -64,8 +64,13 @@ def costmodel(sf):
     # Penalize vertical vectorization
     vertical_penalty = 1 + math.log(sf.vertical_width)
 
+    # Penalize scalar sum factorization kernels
+    scalar_penalty = 1
+    if isinstance(sf, SumfactKernel):
+        scalar_penalty = get_vcl_type_size(np.float64)
+
     # Return total operations
-    return sf.operations * position_penalty_factor(sf) * vertical_penalty
+    return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
 
 
 @backend(interface="vectorization_strategy", name="explicit")
@@ -89,8 +94,6 @@ def explicit_costfunction(sf):
 
 
 def strategy_cost(strategy):
-    qp, strategy = strategy
-    set_quadrature_points(qp)
     func = get_backend(interface="vectorization_strategy",
                        selector=lambda: get_option("vectorization_strategy"))
     keys = set(sf.cache_key for sf in strategy.values())
@@ -160,35 +163,14 @@ def decide_vectorization_strategy():
 
     # Find the best vectorization strategy by using a costmodel
     width = get_vcl_type_size(np.float64)
-    strategy = min(vectorization_opportunity_generator(active_sumfacts, width),
-                   key=strategy_cost)
 
-    # Treat the quadrature points
-    qp, sfdict = strategy
-    set_quadrature_points(qp)
-
-    logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
-                 "\n".join(stringify_vectorization_strategy(strategy)))
-
-    # We map inactive sum factorization kernels to 0
-    sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
-
-    # Register the results
-    for sf in all_sumfacts:
-        _cache_vectorization_info(sf, sfdict[sf])
-
-
-def vectorization_opportunity_generator(sumfacts, width):
-    """ Generator that yields all vectorization opportunities for the given
-    sum factorization kernels as tuples of quadrature point tuple and vectorization
-    dictionary """
     #
-    # Find all the possible quadrature point tuples
+    # Optimize over all the possible quadrature point tuples
     #
     quad_points = [quadrature_points_per_direction()]
 
     if get_option("vectorization_allow_quadrature_changes"):
-        sf = next(iter(sumfacts))
+        sf = next(iter(active_sumfacts))
         depth = 1
         while depth <= width:
             i = 0 if sf.matrix_sequence[0].face is None else 1
@@ -198,12 +180,44 @@ def vectorization_opportunity_generator(sumfacts, width):
             depth = depth * 2
         quad_points = list(set(quad_points))
 
-    for qp in quad_points:
-        #
-        # Determine vectorization opportunities given a fixed quadrature point number
-        #
-        for opp in fixed_quad_vectorization_opportunity_generator(frozenset(sumfacts), width, qp):
-            yield qp, opp
+    # Find the minimum cost strategy between all the quadrature point tuples
+    optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points}
+    qp = min(optimal_strategies, key=lambda qp: strategy_cost(optimal_strategies[qp]))
+    sfdict = optimal_strategies[qp]
+    set_quadrature_points(qp)
+
+    logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
+                 "\n".join(stringify_vectorization_strategy((qp, sfdict))))
+
+    # We map inactive sum factorization kernels to 0
+    sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
+
+    # Register the results
+    for sf in all_sumfacts:
+        _cache_vectorization_info(sf, sfdict[sf])
+
+
+def fixed_quadrature_optimal_vectorization(sumfacts, width, qp):
+    """ For a given quadrature point tuple, find the optimal strategy!
+
+    In order to have this scale sufficiently, we cannot simply list all vectorization
+    opportunities and score them individually, but we need to do a divide and conquer
+    approach.
+    """
+    set_quadrature_points(qp)
+
+    # Find the sets of simultaneously realizable kernels (thats an equivalence relation)
+    keys = frozenset(sf.input_key for sf in sumfacts)
+
+    # Find minimums for each of these sets
+    sfdict = frozendict()
+    for key in keys:
+        key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
+        minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
+                      key=strategy_cost)
+        sfdict = add_to_frozendict(sfdict, minimum)
+
+    return sfdict
 
 
 def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
@@ -227,20 +241,14 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
                                                               ):
         yield opp
 
-    # Find all the sum factorization kernels that the chosen kernel can be parallelized with
-    #
-    # TODO: Right now we check for same input, which is not actually needed in order
-    #       to be a suitable candidate! We should relax this concept at some point!
-    candidates = filter(lambda sf: sf.input_key == sf_to_decide.input_key, sumfacts)
-
     horizontal = 1
     while horizontal <= width:
         # Iterate over the possible combinations of sum factorization kernels
         # taking into account all the permutations of kernels. This also includes
         # combinations which use a padding of 1 - but only for pure horizontality.
-        generators = [it.permutations(candidates, horizontal)]
+        generators = [it.permutations(sumfacts, horizontal)]
         if horizontal >= 4:
-            generators.append(it.permutations(candidates, horizontal - 1))
+            generators.append(it.permutations(sumfacts, horizontal - 1))
         for combo in it.chain(*generators):
             # The chosen kernels must be part of the kernels for recursion
             # to work correctly