diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 90ab3a11178b77984ede80913f517cea13df574c..9a9cca4ca53c5a50e3e40226116a620c064f2c16 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -54,15 +54,12 @@ class PerftoolOptionsArray(ImmutableRecord):
     project_basedir = PerftoolOption(helpstr="The base (build) directory of the dune-perftool project")
     fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
     sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
-    vectorize_quad = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)")
-    vectorize_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization")
-    vectorize_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization")
-    vectorize_padding = PerftoolOption(default=None, helpstr="an explicit value for padding in vectorization")
-    vectorize_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
+    vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model")
+    vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
+    vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
+    vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
+    vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
     turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
     architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
     grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 7a9f4169c837f019d5af2842875c3ef39d2eb133..900ce61711e67310472d143ebea5930f52ba5756 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -549,7 +549,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
     kernel = heuristic_duplication(kernel)
 
     # Maybe apply vectorization strategies
-    if get_option("vectorize_quad"):
+    if get_option("vectorization_quadloop"):
         if get_option("sumfact"):
             from dune.perftool.loopy.transformations.vectorize_quad import vectorize_quadrature_loop
             kernel = vectorize_quadrature_loop(kernel)
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 53a5c278f1d361abc415bf4186b856ce8c01a7d6..e3edaf2cb70d1a06681742445bc83314b42c306b 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -4,7 +4,9 @@ import logging
 
 from dune.perftool.loopy.vcl import get_vcl_type_size
 from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
-from dune.perftool.generation import (generator_factory,
+from dune.perftool.generation import (backend,
+                                      generator_factory,
+                                      get_backend,
                                       get_counted_variable,
                                       get_global_context_value,
                                       )
@@ -48,165 +50,38 @@ def attach_vectorization_info(sf):
         return _cache_vectorization_info(sf, None)
 
 
-def no_vec(sf):
-    return sf.copy(buffer=get_counted_variable("buffer"))
+@backend(interface="vectorization_costfunction", name="greedy")
+def greedy_costfunction(sf):
+    return 1
 
 
-def no_vectorization(sumfacts):
-    return {sf: no_vec(sf) for sf in sumfacts}
+@backend(interface="vectorization_costfunction", name="explicit")
+def explicit_costfunction(sf):
+    # Read the explicitly set values for horizontal and vertical vectorization
+    width = get_vcl_type_size(np.float64)
+    horizontal = int(get_option("vectorization_horizontal", width))
+    vertical = int(get_option("vectorization_vertical", 1))
 
-
-def vertical_vectorization_strategy(sumfact, depth):
-    # If depth is 1, there is nothing do
-    if depth == 1:
-        if isinstance(sumfact, SumfactKernel):
-            return {sumfact: sumfact}
-        else:
-            return {k: sumfact for k in sumfact.kernels}
-
-    # Assert that this is not already sliced
-    assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
-    result = {}
-
-    # Determine which of the matrices in the kernel should be sliced
-    def determine_slice_direction(sf):
-        for i, mat in enumerate(sf.matrix_sequence):
-            if mat.quadrature_size % depth == 0:
-                return i
-            elif get_option("vectorize_allow_quadrature_changes") and mat.quadrature_size != 1:
-                quad = list(quadrature_points_per_direction())
-                quad[i] = round_to_multiple(quad[i], depth)
-                set_quadrature_points(tuple(quad))
-                return i
-            elif mat.quadrature_size != 1:
-                raise PerftoolError("Vertical vectorization is not possible!")
-
-    if isinstance(sumfact, SumfactKernel):
-        kernels = [sumfact]
+    if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
+        return 1
     else:
-        assert isinstance(sumfact, VectorizedSumfactKernel)
-        kernels = sumfact.kernels
-
-    newkernels = []
-    for sf in kernels:
-        sliced = determine_slice_direction(sf)
-        oldtab = sf.matrix_sequence[sliced]
-        for i in range(depth):
-            seq = list(sf.matrix_sequence)
-            seq[sliced] = oldtab.copy(slice_size=depth,
-                                      slice_index=i)
-            newkernels.append(sf.copy(matrix_sequence=tuple(seq)))
-
-    if isinstance(sumfact, SumfactKernel):
-        buffer = get_counted_variable("vertical_buffer")
-        result[sumfact] = VectorizedSumfactKernel(kernels=tuple(newkernels),
-                                                  buffer=buffer,
-                                                  vertical_width=depth,
-                                                  )
-    else:
-        assert isinstance(sumfact, VectorizedSumfactKernel)
-        for sf in kernels:
-            result[sf] = sumfact.copy(kernels=tuple(newkernels),
-                                      vertical_width=depth,
-                                      )
-
-    return result
-
-
-def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1):
-    result = {}
-    todo = set(sumfacts)
-    while todo:
-        kernels = []
-        for sf in sorted(todo, key=lambda s: s.position_priority):
-            if len(kernels) < width:
-                kernels.append(sf)
-                todo.discard(sf)
-
-        buffer = get_counted_variable("joined_buffer")
-        if len(kernels) in range(width - allow_padding, width + 1):
-            for sf in kernels:
-                result[sf] = VectorizedSumfactKernel(kernels=tuple(kernels),
-                                                     horizontal_width=width,
-                                                     buffer=buffer,
-                                                     )
-
-    return result
-
-
-def diagonal_vectorization_strategy(sumfacts, width):
-    # Read explicitly set values
-    horizontal = get_option("vectorize_horizontal")
-    vertical = get_option("vectorize_vertical")
-    padding = get_option("vectorize_padding")
-
-    if width == 4:
-        if horizontal is None:
-            horizontal = 2
-        if vertical is None:
-            vertical = 2
-        if padding is None:
-            padding = 0
-    elif width == 8:
-        if horizontal is None:
-            horizontal = 4
-        if vertical is None:
-            vertical = 2
-        if padding is None:
-            padding = 1
-    else:
-        raise NotImplementedError
-
-    horizontal = int(horizontal)
-    vertical = int(vertical)
-    padding = int(padding)
-
-    result = {}
-
-    horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=padding)
-
-    for sf in horizontal_kernels:
-        vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width)
-        for k in vert:
-            result[k] = vert[k]
+        return 2
 
-    return result
 
-
-def greedy_vectorization_strategy(sumfacts, width):
-    sumfacts = set(sumfacts)
-    horizontal = width
-    vertical = 1
-    allowed_padding = 1
-
-    result = {}
-
-    while horizontal > 0:
-        if horizontal > 1:
-            horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=allowed_padding)
-        else:
-            horizontal_kernels = {sf: sf for sf in sumfacts}
-
-        for sf in horizontal_kernels:
-            if horizontal_kernels[sf].horizontal_width == horizontal:
-                vert = vertical_vectorization_strategy(horizontal_kernels[sf],
-                                                       vertical)
-                for k in vert:
-                    result[k] = vert[k]
-                sumfacts.discard(sf)
-
-        horizontal = horizontal // 2
-        vertical = vertical * 2
-
-        # We heuristically allow padding only on the full SIMD width
-        allowed_padding = 0
-    return result
+def strategy_cost(strategy):
+    qp, strategy = strategy
+    set_quadrature_points(qp)
+    func = get_backend(interface="vectorization_costfunction",
+                       name=get_option("vectorization_strategy"))
+    return sum(float(func(sf)) for sf in strategy.values())
 
 
-def print_vectorization_strategy(strategy):
+def stringify_vectorization_strategy(strategy):
+    result = []
     qp, strategy = strategy
-    print "\nPrinting potential vectorization strategy:"
-    print "Quadrature point tuple: {}".format(qp)
+
+    result.append["Printing potential vectorization strategy:"]
+    result.append["Quadrature point tuple: {}".format(qp)]
 
     # Look for all realizations in the strategy and iterate over them
     cache_keys = frozenset(v.cache_key for v in strategy.values())
@@ -214,12 +89,12 @@ def print_vectorization_strategy(strategy):
         # Filter all the kernels that are realized by this and print
         for key in strategy:
             if strategy[key].cache_key == ck:
-                print "{}:".format(key)
+                result.append["{}:".format(key)]
 
         # Find one representative to print
         for val in strategy.values():
             if val.cache_key == ck:
-                print "    {}".format(val)
+                result.append["    {}".format(val)]
                 break
 
 
@@ -243,45 +118,29 @@ def decide_vectorization_strategy():
     # All sum factorization kernels that get used
     active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
 
-    # We map inacitve sum factorizatino kernels to 0
-    sfdict = {}
-    for sf in inactive_sumfacts:
-        sfdict[sf] = 0
+    # If no vectorization is needed, abort now
+    if get_option("vectorization_strategy") == "none":
+        for sf in all_sumfacts:
+            _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
+        return
 
     logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
                  .format(len(active_sumfacts)))
 
-    if get_option("vectorize_grads"):
-        # Currently we base our idea here on the fact that we only group sum
-        # factorization kernels with the same input.
-        inputkeys = set(sf.input_key for sf in active_sumfacts)
-        for inputkey in inputkeys:
-            width = get_vcl_type_size(np.float64)
-            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
-            for old, new in horizontal_vectorization_strategy(sumfact_filter, width).items():
-                sfdict[old] = new
-    elif get_option("vectorize_slice"):
-        for sumfact in active_sumfacts:
-            width = get_vcl_type_size(np.float64)
-            for old, new in vertical_vectorization_strategy(sumfact, width).items():
-                sfdict[old] = new
-    elif get_option("vectorize_diagonal"):
-        inputkeys = set(sf.input_key for sf in active_sumfacts)
-        for inputkey in inputkeys:
-            width = get_vcl_type_size(np.float64)
-            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
-            for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
-                sfdict[old] = new
-    elif get_option("vectorize_greedy"):
-        inputkeys = set(sf.input_key for sf in active_sumfacts)
-        for inputkey in inputkeys:
-            width = get_vcl_type_size(np.float64)
-            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
-            for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
-                sfdict[old] = new
-    else:
-        for old, new in no_vectorization(active_sumfacts).items():
-            sfdict[old] = new
+    # Find the best vectorization strategy by using a costmodel
+    width = get_vcl_type_size(np.float64)
+    strategy = min(vectorization_opportunity_generator(active_sumfacts, width),
+                   key=strategy_cost)
+
+    # Treat the quadrature points
+    qp, sfdict = strategy
+    set_quadrature_points(qp)
+
+    logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
+                 "\n".join(stringify_vectorization_strategy(strategy)))
+
+    # We map inactive sum factorization kernels to 0
+    sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
 
     # Register the results
     for sf in all_sumfacts:
@@ -297,7 +156,7 @@ def vectorization_opportunity_generator(sumfacts, width):
     #
     quad_points = [quadrature_points_per_direction()]
 
-    if True or get_option("vectorize_allow_quadrature_changes"):
+    if True or get_option("vectorization_allow_quadrature_changes"):
         sf = next(iter(sumfacts))
         depth = 1
         while depth <= width:
@@ -331,7 +190,9 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
     for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
                                                               width,
                                                               qp,
-                                                              add_to_frozendict(already, {sf_to_decide: sf_to_decide}),
+                                                              add_to_frozendict(already,
+                                                                                {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
+                                                                                ),
                                                               ):
         yield opp