diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 66958d0b375476b200a1858f8030f9ff5ea28629..ea5255725b527d6fa59f3f005f49cdc0d2a11dd8 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -235,6 +235,19 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
     # Some convenience methods to extract information about the sum factorization kernel
     #
 
+    def __lt__(self, other):
+        if self.parallel_key != other.parallel_key:
+            return self.parallel_key < other.parallel_key
+        if self.input_key != other.input_key:
+            return self.input_key < other.input_key
+        if self.position_priority == other.position_priority:
+            return repr(self) < repr(other)
+        if self.position_priority is None:
+            return False
+        if other.position_priority is None:
+            return True
+        return self.position_priority < other.position_priority
+
     @property
     def length(self):
         """ The number of matrices to apply """
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index c79161ac61cdb932cdfc419888e9d2aef7ddf347..e4a044edf0e62b8b2176efe4755a21617f0fcd8c 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -48,13 +48,6 @@ def attach_vectorization_info(sf):
         return _cache_vectorization_info(sf, None)
 
 
-def position_penalty_factor(sf):
-    if isinstance(sf, SumfactKernel) or sf.vertical_width > 1:
-        return 1
-    else:
-        return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length))
-
-
 @backend(interface="vectorization_strategy", name="model")
 def costmodel(sf):
     # Penalize vertical vectorization
@@ -66,7 +59,7 @@ def costmodel(sf):
         scalar_penalty = get_vcl_type_size(dtype_floatingpoint())
 
     # Return total operations
-    return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
+    return sf.operations * vertical_penalty * scalar_penalty
 
 
 @backend(interface="vectorization_strategy", name="fromlist")
@@ -90,7 +83,7 @@ def explicit_costfunction(sf):
 
     if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
         # Penalize position mapping
-        return sf.operations * position_penalty_factor(sf)
+        return sf.operations
     else:
         return 1000000000000
 
@@ -292,7 +285,7 @@ def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, alread
 
     # Find the number of input coefficients we can work on
     keys = frozenset(sf.input_key for sf in sumfacts)
-    inputkey_sumfacts = [frozenset(filter(lambda sf: sf.input_key == key, sumfacts)) for key in keys]
+    inputkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.input_key == key, sumfacts))) for key in keys]
 
     for parallel in (1, 2):
         if parallel == 2 and next(iter(sumfacts)).stage == 3:
@@ -301,98 +294,38 @@ def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, alread
                             it.permutations(range(len(keys)), parallel)):
             horizontal = 1
             while horizontal <= width // parallel:
-                generators = [filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
-                                     it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal) for part in which)))]
-                if horizontal >=4:
-                    generators.append(filter(lambda c: sum(c, ()) == tuple(sorted(sum(c, ()))),
-                                             it.product(*tuple(it.permutations(inputkey_sumfacts[part], horizontal - 1) for part in which))))
-                for combo in it.chain(*generators):
-                    combo = sum(combo, ())
-
-                    vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
-                    if vecdict is None:
-                        # This particular choice was rejected for some reason.
-                        # Possible reasons:
-                        # * the quadrature point tuple not being suitable
-                        #   for this vectorization strategy
-                        continue
-
-                    # Go into recursion to also vectorize all kernels not in this combo
-                    for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
-                                                                               width,
-                                                                               qp,
-                                                                               add_to_frozendict(already, vecdict),
-                                                                               ):
-                        yielded = True
-                        yield opp
+                combo = sum((inputkey_sumfacts[part][:horizontal] for part in which), ())
 
+                vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp)
                 horizontal *= 2
 
+                if vecdict is None:
+                    # This particular choice was rejected for some reason.
+                    # Possible reasons:
+                    # * the quadrature point tuple not being suitable
+                    #   for this vectorization strategy
+                    # * there are not enough horizontal kernels
+                    continue
+
+                # Go into recursion to also vectorize all kernels not in this combo
+                for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo),
+                                                                            width,
+                                                                            qp,
+                                                                            add_to_frozendict(already, vecdict),
+                                                                            ):
+                    yielded = True
+                    yield opp
+
     # If we did not yield on this recursion level, yield what we got so far
     if not yielded:
         yield already
 
-#
-# def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
-#     if len(sumfacts) == 0:
-#         # We have gone into recursion deep enough to have all sum factorization nodes
-#         # assigned their vectorized counterpart. We can yield the result now!
-#         yield already
-#         return
-#
-#     # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
-#     # fromlist strategy to pick correct strategies across different program runs
-#     sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
-#
-#     # Otherwise we pick a random sum factorization kernel and construct all the vectorization
-#     # opportunities realizing this particular kernel and go into recursion.
-#     sf_to_decide = sumfacts[0]
-#
-#     # Have "unvectorized" as an option, although it is not good
-#     for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
-#                                                               width,
-#                                                               qp,
-#                                                               add_to_frozendict(already,
-#                                                                                 {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
-#                                                                                 ),
-#                                                               ):
-#         yield opp
-#
-#     horizontal = 1
-#     while horizontal <= width:
-#         # Iterate over the possible combinations of sum factorization kernels
-#         # taking into account all the permutations of kernels. This also includes
-#         # combinations which use a padding of 1 - but only for pure horizontality.
-#         generators = [it.permutations(sumfacts, horizontal)]
-#         if horizontal >= 4:
-#             generators.append(it.permutations(sumfacts, horizontal - 1))
-#         for combo in it.chain(*generators):
-#             # The chosen kernels must be part of the kernels for recursion
-#             # to work correctly
-#             if sf_to_decide not in combo:
-#                 continue
-#
-#             # Set up the vectorization dict for this combo
-#             vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
-#             if vecdict is None:
-#                 # This particular choice was rejected for some reason.
-#                 # Possible reasons:
-#                 # * the quadrature point tuple not being suitable
-#                 #   for this vectorization strategy
-#                 continue
-#
-#             # Go into recursion to also vectorize all kernels not in this combo
-#             for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
-#                                                                       width,
-#                                                                       qp,
-#                                                                       add_to_frozendict(already, vecdict),
-#                                                                       ):
-#                 yield opp
-#
-#         horizontal = horizontal * 2
-
 
 def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
+    # Discard opportunities that do not contain enough horizontal kernels
+    if len(sumfacts) not in (horizontal, horizontal - 1):
+        return None
+
     # Enhance the list of sumfact nodes by adding vertical splittings
     kernels = []
     for sf in sumfacts: