diff --git a/python/dune/perftool/error.py b/python/dune/perftool/error.py
index 3f99a83abeb947890b5c7fb36ffd2663514796fb..4d428b41b516845c3eb18803539edbf293f44cf1 100644
--- a/python/dune/perftool/error.py
+++ b/python/dune/perftool/error.py
@@ -15,3 +15,7 @@ class PerftoolCodegenError(PerftoolError):
 
 class PerftoolLoopyError(PerftoolError):
     pass
+
+
+class PerftoolVectorizationError(PerftoolCodegenError):
+    pass
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 5611bbedf4eab089eceb8740487c6f60236f543e..adf7e8ce483fd36bdaa44a90fa8b5f54bb25e442 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -57,17 +57,19 @@ class PerftoolOptionsArray(ImmutableRecord):
     fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
     sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
     vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|fromlist")
     vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
     vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
     vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
     vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
+    vectorization_list_index = PerftoolOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).")
     turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
     architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
     grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
     simplify = PerftoolOption(default=False, helpstr="Whether to simplify expressions using sympy")
     precision_bits = PerftoolOption(default=64, helpstr="The number of bits for the floating point type")
     assure_statement_ordering = PerftoolOption(default=False, helpstr="Whether special care should be taken for a good statement ordering in sumfact kernels, runs into a loopy scheduler performance bug, but is necessary for production.")
+    generate_jacobians = PerftoolOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is automatically set to false when numerical_jacobian is set to true.")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = PerftoolOption(default=256, helpstr=None)
@@ -127,6 +129,9 @@ def process_options(opt):
     if opt.sumfact:
         opt = opt.copy(unroll_dimension_loops=True)
 
+    if opt.numerical_jacobian:
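+        # Numerical jacobians make the generated jacobian_* methods unnecessary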
+        opt = opt.copy(generate_jacobians=False)
+
     if opt.overlapping:
         opt = opt.copy(parallel=True)
 
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 23c92da10c0945b6e4196b1ccde44f0cdbdb5c83..76d6b0c162e2b5677905ac6c79cfefc8f9f3e01b 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -821,22 +821,23 @@ def generate_localoperator_kernels(formdata, data):
         jacform = preprocess_form(jacform).preprocessed_form
 
         with global_context(form_type="jacobian"):
-            for measure in set(i.integral_type() for i in jacform.integrals()):
-                logger.info("generate_localoperator_kernels: measure {}".format(measure))
-                with global_context(integral_type=measure):
-                    with global_context(kernel=assembler_routine_name()):
-                        kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(jacform.integrals_by_type(measure))]
-                operator_kernels[(measure, 'jacobian')] = kernel
-
-            # Generate dummy functions for those kernels, that vanished in the differentiation process
-            # We *could* solve this problem by using lambda_* terms but we do not really want that, so
-            # we use empty jacobian assembly methods instead
-            alpha_measures = set(i.integral_type() for i in form.integrals())
-            jacobian_measures = set(i.integral_type() for i in jacform.integrals())
-            for it in alpha_measures - jacobian_measures:
-                with global_context(integral_type=it):
-                    from dune.perftool.pdelab.signatures import assembly_routine_signature
-                    operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
+            if get_option("generate_jacobians"):
+                for measure in set(i.integral_type() for i in jacform.integrals()):
+                    logger.info("generate_localoperator_kernels: measure {}".format(measure))
+                    with global_context(integral_type=measure):
+                        with global_context(kernel=assembler_routine_name()):
+                            kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(jacform.integrals_by_type(measure))]
+                    operator_kernels[(measure, 'jacobian')] = kernel
+
+                # Generate dummy functions for those kernels that vanished in the differentiation process.
+                # We *could* solve this problem by using lambda_* terms, but we do not really want that, so
+                # we use empty jacobian assembly methods instead.
+                alpha_measures = set(i.integral_type() for i in form.integrals())
+                jacobian_measures = set(i.integral_type() for i in jacform.integrals())
+                for it in alpha_measures - jacobian_measures:
+                    with global_context(integral_type=it):
+                        from dune.perftool.pdelab.signatures import assembly_routine_signature
+                        operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
 
         # Jacobian apply methods for matrix-free computations
         if get_option("matrix_free"):
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 46563ee1f6805037b13b13136e16f2fcc1d23ff0..19278ef7ff5fe6cfbefdd9534f30bbe4ee5ea345 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -64,9 +64,12 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
                                  restriction=restriction,
                                  )
 
-    def __str__(self):
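+    # A deterministic string representation is needed so that sum factorization kernels
+    # can be sorted reproducibly (see the 'fromlist' strategy in sumfact/vectorization.py).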
+    def __repr__(self):
         return "{}_{}".format(self.coeff_func(self.restriction), self.element_index)
 
+    def __str__(self):
+        return repr(self)
+
     def realize(self, sf, index, insn_dep):
         lfs = name_lfs(self.element, self.restriction, self.element_index)
         basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index cbe8fbbc374c6ada8e3c63a1a81adb5a3c742ac6..4009a7bf7a2bb5356f53129f28c80420ff5a8c1a 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -22,7 +22,7 @@ from dune.perftool.generation import (class_member,
                                       )
 from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.loopy.target import dtype_floatingpoint
-from dune.perftool.loopy.vcl import ExplicitVCLCast
+from dune.perftool.loopy.vcl import ExplicitVCLCast, get_vcl_type_size
 from dune.perftool.pdelab.localoperator import (name_domain_field,
                                                 lop_template_range_field,
                                                 )
@@ -196,7 +196,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         # Check whether we can realize this by broadcasting the values of a simple tabulation
         if len(set(self.tabs)) == 1:
             theta = self.tabs[0].pymbolic(indices[:-1])
-            return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=len(self.tabs)), (theta,))
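+            # Broadcast the scalar tabulation value across the full VCL vector width
+            # of the floating point type in use.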
+            return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=get_vcl_type_size(dtype_floatingpoint())), (theta,))
 
         abbrevs = tuple("{}x{}".format("d" if t.derivative else "",
                                        "s{}".format(t.slice_index) if t.slice_size is not None else "")
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index f3fb96b99b0d7cb0a633e6d499f6e362db5da207..4dab904154d80072585453383e482725bfa5da97 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -18,9 +18,9 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                               quadrature_points_per_direction,
                                               set_quadrature_points,
                                               )
-from dune.perftool.error import PerftoolError
+from dune.perftool.error import PerftoolVectorizationError
 from dune.perftool.options import get_option
-from dune.perftool.tools import add_to_frozendict, round_to_multiple
+from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff
 
 from pytools import product
 from frozendict import frozendict
@@ -33,7 +33,7 @@ import math
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
 def _cache_vectorization_info(old, new):
     if new is None:
-        raise PerftoolError("Vectorization info for sum factorization kernel was not gathered correctly!")
+        raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!")
     return new
 
 
@@ -69,6 +69,12 @@ def costmodel(sf):
     return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
 
 
+@backend(interface="vectorization_strategy", name="fromlist")
+def fromlist_costmodel(sf):
+    # The fromlist strategy needs to reuse the cost model!
+    return costmodel(sf)
+
+
 @backend(interface="vectorization_strategy", name="explicit")
 def explicit_costfunction(sf):
     # Read the explicitly set values for horizontal and vertical vectorization
@@ -139,7 +145,7 @@ def decide_vectorization_strategy():
     from dune.perftool.generation import retrieve_cache_items
     all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
 
-    # Stage 1 sumfactorizations that were actually used
+    # Stage 1 sum factorizations that were actually used
     basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')]
 
     # This means we can have sum factorizations that will not get used
@@ -164,7 +170,6 @@ def decide_vectorization_strategy():
     # Optimize over all the possible quadrature point tuples
     #
     quad_points = [quadrature_points_per_direction()]
-
     if get_option("vectorization_allow_quadrature_changes"):
         sf = next(iter(active_sumfacts))
         depth = 1
@@ -176,10 +181,43 @@ def decide_vectorization_strategy():
             depth = depth * 2
         quad_points = list(set(quad_points))
 
-    # Find the minimum cost strategy between all the quadrature point tuples
-    optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points}
-    qp = min(optimal_strategies, key=lambda qp: strategy_cost(optimal_strategies[qp]))
-    sfdict = optimal_strategies[qp]
+    if get_option("vectorization_strategy") == "fromlist":
+        # This is a bit special and does not follow the minimization procedure at all
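+        # Instead of minimizing a cost model, all vectorization opportunities for the
+        # stage 1 kernels are enumerated in a deterministic order and the one at position
+        # 'vectorization_list_index' is picked. Kernels of other stages are still chosen
+        # by cost minimization below.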
+
+        def _choose_strategy_from_list(stage1_sumfacts):
+            strategy = 0
+            for qp in quad_points:
+                for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp):
+                    if strategy == int(get_option("vectorization_list_index")):
+                        set_quadrature_points(qp)
+                        # Output the strategy and its cost into a separate file
+                        if get_global_context_value("form_type") == "jacobian_apply":
+                            with open("strategycosts.csv", "a") as f:
+                                f.write("{} {}\n".format(strategy, strategy_cost(strat)))
+                        return qp, strat
+                    strategy = strategy + 1
+
+            raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_option("vectorization_list_index")))
+
+        s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1)
+
+        total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points)
+        print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_option("vectorization_list_index")),
+                                                                                              total))
+        qp, sfdict = _choose_strategy_from_list(s1_sumfacts)
+
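+        # For all kernels that are not stage 1, fall back to per-input-key cost minimization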
+        keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1)
+        for key in keys:
+            key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key)
+            minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
+                          key=strategy_cost)
+            sfdict = add_to_frozendict(sfdict, minimum)
+    else:
+        # Find the minimum cost strategy between all the quadrature point tuples
+        optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points}
+        qp = min(optimal_strategies, key=lambda qp: strategy_cost(optimal_strategies[qp]))
+        sfdict = optimal_strategies[qp]
+
     set_quadrature_points(qp)
 
     logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
@@ -207,6 +245,7 @@ def fixed_quadrature_optimal_vectorization(sumfacts, width, qp):
 
     # Find minimums for each of these sets
     sfdict = frozendict()
+
     for key in keys:
         key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
         minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
@@ -223,12 +262,16 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
         yield already
         return
 
+    # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
+    # fromlist strategy to pick the same strategy for a given index across different program runs.
+    sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
+
     # Otherwise we pick a random sum factorization kernel and construct all the vectorization
     # opportunities realizing this particular kernel and go into recursion.
-    sf_to_decide = next(iter(sumfacts))
+    sf_to_decide = sumfacts[0]
 
     # Have "unvectorized" as an option, although it is not good
-    for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
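+    # Use list_diff instead of a set difference to preserve the deterministic ordering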
+    for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
                                                               width,
                                                               qp,
                                                               add_to_frozendict(already,
@@ -261,7 +304,7 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
                 continue
 
             # Go into recursion to also vectorize all kernels not in this combo
-            for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference(combo),
+            for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
                                                                       width,
                                                                       qp,
                                                                       add_to_frozendict(already, vecdict),
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index d358f3ab45ef8231a3a952a881167e7108c14704..e302f28c0e63d03d013c181dc0eb4f831ec1648c 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -74,3 +74,11 @@ def add_to_frozendict(fd, valdict):
     t = dict(fd)
     t.update(valdict)
     return frozendict.frozendict(t)
+
+
+def list_diff(l1, l2):
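+    """Return those items of l1 that are not in l2, preserving the order of l1."""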
+    l = []
+    for item in l1:
+        if item not in l2:
+            l.append(item)
+    return l
diff --git a/python/setup.py b/python/setup.py
index ccbe9e56658fa87a0c574767f6c7f4282612a750..c7e80ce1b0de4a2a561ee1a1a4d2189ec0df0d0c 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -45,5 +45,6 @@ setup(name='dune.perftool',
       entry_points = {
         "console_scripts": [
             "ufl2pdelab = dune.perftool.compile:compile_form",
+            "picklevecstrats = dune.perftool.sumfact.vectorization:pickle_vectorization_strategies",
         ]
     })