diff --git a/bin/donkey_benchmark_wrapper.py b/bin/donkey_benchmark_wrapper.py
new file mode 100755
index 0000000000000000000000000000000000000000..7951b8b06bfdeb217bd3328c9b3b88f229bbc606
--- /dev/null
+++ b/bin/donkey_benchmark_wrapper.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+import time
+import sys
+import subprocess
+import os
+
+# Run the actual command
+command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
+command.extend(sys.argv[1:])
+ret = subprocess.call(command)
+
+# If that failed - fail!
+if ret != 0:
+    sys.exit(ret)
+
+# If that was succesful, wait for the output file to be available on the filesystem
+# This step is necessary because the NFS synchronization is too slow for our workflow.
+while not os.path.isfile(sys.argv[2]):
+    time.sleep(0.1)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 8b6d3318bc46f180eb7ef4a388a20cbcc65fbbe5..a4c8a3702eccd689568cd9def3f24462e09d3f4f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -20,3 +20,7 @@ dune_python_add_test(NAME pep8-ourcode
                      )
 
 add_subdirectory(test)
+
+# Add a dummy target to extract compiler flags for the autotune tool chain
+add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
+target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
diff --git a/python/_autotune.cc b/python/_autotune.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c87b37802d394fb8dafc0ca6ae150797e81199a
--- /dev/null
+++ b/python/_autotune.cc
@@ -0,0 +1,2 @@
+int main()
+{}
diff --git a/python/dune/perftool/error.py b/python/dune/perftool/error.py
index 4d428b41b516845c3eb18803539edbf293f44cf1..fb77655aef3689b27b2764d76a3e20c158cbf982 100644
--- a/python/dune/perftool/error.py
+++ b/python/dune/perftool/error.py
@@ -19,3 +19,7 @@ class PerftoolLoopyError(PerftoolError):
 
 class PerftoolVectorizationError(PerftoolCodegenError):
     pass
+
+
+class PerftoolAutotuneError(PerftoolVectorizationError):
+    pass
diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index fc9a6ce42f58f2eecae533a6b7add45dbce450c2..607b4055850d33f2fb78f2d03ee4be771caec904 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -54,6 +54,7 @@ from dune.perftool.generation.loopy import (barrier,
                                             valuearg,
                                             )
 
-from dune.perftool.generation.context import (global_context,
+from dune.perftool.generation.context import (cache_restoring,
+                                              global_context,
                                               get_global_context_value,
                                               )
diff --git a/python/dune/perftool/generation/cache.py b/python/dune/perftool/generation/cache.py
index b4ae54f28479d607907ac59c3c6078283535bc85..31b1b85e0da2797a158b4d49ed10bdf89d7dfe0b 100644
--- a/python/dune/perftool/generation/cache.py
+++ b/python/dune/perftool/generation/cache.py
@@ -268,7 +268,7 @@ def delete_cache_items(condition=True, keep=False):
         gen._memoize_cache = _filter_cache_items(gen, condition)
 
 
-def retrieve_cache_functions(condition=True):
+def retrieve_cache_functions(condition="True"):
     return [g.func for g in _generators if eval(condition, _ConditionDict(g.item_tags))]
 
 
diff --git a/python/dune/perftool/generation/context.py b/python/dune/perftool/generation/context.py
index 55feda422efbabe48c7663e87fea289585253cde..4be1125a2deb0a695f1815f82293d29dc81880f2 100644
--- a/python/dune/perftool/generation/context.py
+++ b/python/dune/perftool/generation/context.py
@@ -31,3 +31,23 @@ def global_context(**kwargs):
 
 def get_global_context_value(key, default=None):
     return _global_context_cache.get(key, default)
+
+
+class _CacheRestoringContext(object):
+    def __enter__(self):
+        from dune.perftool.generation.cache import _generators as g
+        self.cache = {}
+        for i in g:
+            self.cache[i] = {}
+            for k, v in i._memoize_cache.items():
+                self.cache[i][k] = v
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        for i, c in self.cache.items():
+            i._memoize_cache = {}
+            for k, v in c.items():
+                i._memoize_cache[k] = v
+
+
+def cache_restoring():
+    return _CacheRestoringContext()
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index cc21196fa1679ad109510e3a65155b0394c60d3d..9c551db9a6788b5628a3eb8c5eba287846b5a281 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -44,7 +44,6 @@ def dtype_floatingpoint():
         raise NotImplementedError("{}bit floating point type".format(bits))
 
 
-@pt.memoize
 def numpy_to_cpp_dtype(key):
     _registry = {'float32': 'float',
                  'int32': 'int',
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index c0f4900b8fff2f77b7cbefd593214cfc8d3af87d..a62de9950cf6d3dff0a84b3bc52b2e1e33ed5299 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -83,7 +83,7 @@ class PerftoolFormOptionsArray(ImmutableRecord):
     sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
     vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
     vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index b9541a0f13e802c1f97a5f5ce5e33980327895cb..3bf16b24dd2e3fd5bd330319be2418770b6d3f93 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -243,6 +243,11 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
         else:
             return ()
 
+    @property
+    def fastdg_interface_object_size(self):
+        size = sum(_local_sizes(self.trial_element)) if self.trial_element else 1
+        return size * sum(_local_sizes(self.test_element))
+
 
 def _local_sizes(element):
     from ufl import FiniteElement, MixedElement
@@ -381,6 +386,10 @@ def generate_accumulation_instruction(expr, visitor):
     test_info = visitor.test_info
     trial_info = visitor.trial_info
 
+    # Count flops on the expression for the vectorization decision making algorithm
+    from dune.perftool.sumfact.vectorization import count_quadrature_point_operations
+    count_quadrature_point_operations(expr)
+
     # Number of basis functions per direction
     leaf_element = test_info.element
     from ufl import MixedElement
diff --git a/python/dune/perftool/sumfact/autotune.py b/python/dune/perftool/sumfact/autotune.py
new file mode 100644
index 0000000000000000000000000000000000000000..244873c24b0b64cd61f8b04a6c66500599b15670
--- /dev/null
+++ b/python/dune/perftool/sumfact/autotune.py
@@ -0,0 +1,225 @@
+""" Autotuning for sum factorization kernels """
+
+from dune.perftool.generation import cache_restoring, delete_cache_items
+from dune.perftool.loopy.target import DuneTarget
+from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
+from dune.perftool.options import get_option, set_option
+from dune.perftool.error import PerftoolAutotuneError
+
+import loopy as lp
+from pytools import product
+
+import os
+import re
+import subprocess
+import filelock
+
+
+def get_cmake_cache_entry(entry):
+    for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
+        match = re.match("{}:[INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED|STATIC]+=(.*)".format(entry), line)
+        if match:
+            return match.groups()[0]
+
+
+def get_perftool_dir():
+    if get_cmake_cache_entry("CMAKE_PROJECT_NAME") == "dune-perftool":
+        return get_option("project_basedir")
+    else:
+        return get_cmake_cache_entry("dune-perftool_DIR")
+
+
+def compiler_invocation(name, filename):
+    # Determine the CMake Generator in use
+    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
+    assert(gen == "Unix Makefiles")
+
+    # Find compiler path
+    compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
+    compile_flags = [compiler]
+
+    # Parse compiler flags
+    for line in open(os.path.join(get_perftool_dir(), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"):
+        match = re.match("([^=]*)=(.*)", line)
+        if match:
+            compile_flags.extend(match.groups()[1].split())
+
+    # Add the source file
+    compile_flags.append(filename)
+
+    # Parse linker flags
+    for line in open(os.path.join(get_perftool_dir(), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"):
+        match = re.match(".*_autotune_target (.*)", line)
+        if match:
+            for flag in match.groups()[0].split():
+                if flag.startswith("-") or os.path.isabs(flag):
+                    compile_flags.append(flag)
+                else:
+                    compile_flags.append(os.path.join(get_perftool_dir(), "python", flag))
+
+    # Set an output name
+    compile_flags.append("-o")
+    compile_flags.append(name)
+
+    return compile_flags
+
+
+def generate_standalone_code(sf, filename):
+    delete_cache_items("kernel_default")
+
+    with open(filename, "w") as f:
+        f.writelines(["#include \"config.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/perftool/common/tsc.hh>\n",
+                      "#include<dune/perftool/common/vectorclass.hh>\n",
+                      "#include<dune/perftool/sumfact/onedquadrature.hh>\n",
+                      "#include<dune/perftool/sumfact/horizontaladd.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "#include<iostream>\n",
+                      "\n"
+                      ])
+
+        f.writelines(["int main(int argc, char** argv)\n",
+                      "{\n",
+                      ])
+
+        # Setup a polynomial object (normally done in the LocalOperator members)
+        opcounting = get_option("opcounter")
+        set_option("opcounter", False)
+        from dune.perftool.loopy.target import type_floatingpoint
+        real = type_floatingpoint()
+        f.write("  using RF = {};\n".format(real))
+        f.write("  using DF = {};\n".format(real))
+
+        from dune.perftool.sumfact.tabulation import name_polynomials
+        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence)
+        for deg in set(degs):
+            f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
+
+        # Get kernels
+        from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
+        knl = realize_sumfact_kernel_function(sf)
+        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
+        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
+        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
+
+        # Allocate buffers
+        size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
+                   product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
+        size = int(size * (get_option("precision_bits") / 8))
+        f.writelines(["  char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
+                      "  char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
+                      ])
+
+        # Setup fastdg inputs
+        for arg in sf.interface.signature_args:
+            if "jacobian" in arg:
+                f.write("{} = 0;\n".format(arg))
+            else:
+                size = sf.interface.fastdg_interface_object_size
+                f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))
+
+        # Write stuff into the input buffer
+        f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
+                      "  {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
+                      "  for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)),
+                      "    input[i] = ({})(i+1);\n".format(real),
+                      ])
+
+        target = DuneTarget()
+        from loopy.codegen import CodeGenerationState
+        codegen_state = CodeGenerationState(kernel=constructor_knl,
+                                            implemented_data_info=None,
+                                            implemented_domain=None,
+                                            implemented_predicates=frozenset(),
+                                            seen_dtypes=frozenset(),
+                                            seen_functions=frozenset(),
+                                            seen_atomic_dtypes=frozenset(),
+                                            var_subst_map={},
+                                            allow_complex=False,
+                                            is_generating_device_code=True,
+                                            )
+
+        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+            f.write("  {}\n".format(next(iter(decl.generate()))))
+
+        for _, line in constructor_knl.preambles:
+            if "gfsu" not in line:
+                f.write("  {}\n".format(line))
+
+        # Add setup code for theta matrices. We add some lines not necessary,
+        # but it would be more work to remove them than keeping them.
+        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
+            if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
+                f.write("  {}\n".format(line))
+
+        # INtroduces a variable that makes sure that the kernel cannot be optimized away
+        f.writelines(["  {} accum;\n".format(real),
+                      "  std::mt19937 rng;\n",
+                      "  rng.seed(42);\n",
+                      "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
+                      ])
+
+        # Start a TSC timer
+        f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
+                      ])
+
+        # Add the implementation of the kernel.
+        f.write("  for(int i=0; i<{}; ++i)\n".format(int(1e9 / sf.operations)))
+        f.write("  {\n")
+        for line in knl.member.lines[1:]:
+            f.write("    {}\n".format(line))
+        f.write("  }\n")
+
+        # Stop the TSC timer and write the result to a file
+        f.writelines(["  auto stop = Dune::PDELab::TSC::stop();\n",
+                      "  std::ofstream file;\n",
+                      "  file.open(argv[1]);\n",
+                      "  file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
+                      "  file.close();\n",
+                      "  accum += output[dis(rng)];\n",
+                      "  std::cout << accum;\n",
+                      "}\n",
+                      ])
+        set_option("opcounter", opcounting)
+
+
+def autotune_realization(sf):
+    # Make sure that the benchmark directory exists
+    dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
+    if not os.path.exists(dir):
+        os.mkdir(dir)
+
+    basename = "autotune_sumfact_{}".format(sf.function_name)
+    name = os.path.join(dir, "autotune_sumfact_{}".format(sf.function_name))
+    filename = os.path.join(dir, "{}.cc".format(basename))
+    logname = os.path.join(dir, "{}.log".format(basename))
+    lock = "{}.lock".format(name)
+
+    # Generate and compile a benchmark program
+    with cache_restoring():
+        with filelock.FileLock(lock):
+            if not os.path.isfile(logname):
+                generate_standalone_code(sf, filename)
+
+                ret = subprocess.call(compiler_invocation(name, filename))
+                if ret != 0:
+                    raise PerftoolAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename))))
+
+                # Check whether the user specified an execution wrapper
+                call = []
+                wrapper = get_cmake_cache_entry("DUNE_PERFTOOL_BENCHMARK_WRAPPER")
+                if wrapper:
+                    call.append(wrapper)
+
+                # Run the benchmark program
+                call.append(name)
+                call.append(logname)
+                devnull = open(os.devnull, 'w')
+                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
+                if ret != 0:
+                    raise PerftoolAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
+
+            # Extract the result form the log file
+            return float(next(iter(open(logname, "r")))) / 1000000
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index b4795478e990e5f82090f8a6346ee73168ede6e9..d846024d54e9f6b62785862803ab4e03498ac6ac 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -138,6 +138,11 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
         else:
             return ()
 
+    @property
+    def fastdg_interface_object_size(self):
+        from dune.perftool.sumfact.accumulation import _local_sizes
+        return sum(_local_sizes(self.element))
+
 
 def _basis_functions_per_direction(element):
     """Number of basis functions per direction """
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 435975e41af8c29c0aaf078450f0046dd721137a..f1ef75074baa956e7a8fd3cb354b233495899501 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -124,6 +124,10 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
     def function_name_suffix(self):
         return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
 
+    @property
+    def fastdg_interface_object_size(self):
+        return self.interfaces[0].fastdg_interface_object_size
+
 
 class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def __init__(self, interfaces):
@@ -212,6 +216,10 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def function_name_suffix(self):
         return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
 
+    @property
+    def fastdg_interface_object_size(self):
+        return self.interfaces[0].fastdg_interface_object_size
+
 
 class SumfactKernelBase(object):
     pass
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index e913ed6243ff50f1e028bcff41dc2d2547e0c690..ef1c23da8c93d8db09fce419242892de68151d4f 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -373,13 +373,17 @@ def name_polynomials(degree):
     return name
 
 
-@preamble(kernel="operator")
 def sort_quadrature_points_weights(qp, qw, bound):
     range_field = lop_template_range_field()
     domain_field = name_domain_field()
     include_file("dune/perftool/sumfact/onedquadrature.hh", filetag="operatorfile")
-    return "onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \
-        .format(range_field, domain_field, bound, qp, qw)
+    return frozenset({instruction(code="onedQuadraturePointsWeights<{}, {}, {}>({}, {});"
+                                  .format(range_field, domain_field, bound, qp, qw),
+                                  assignees=frozenset({qp, qw}),
+                                  read_variables=frozenset({qp, qw}),
+                                  kernel="operator",
+                                  ),
+                      })
 
 
 @iname(kernel="operator")
@@ -450,10 +454,11 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
 
     args = [inames[1]]
 
+    dep = frozenset()
     if tabmat.face is None:
         qp = name_oned_quadrature_points(bound)
         qw = name_oned_quadrature_weights(bound)
-        sort_quadrature_points_weights(qp, qw, bound)
+        dep = sort_quadrature_points_weights(qp, qw, bound)
         args.append(prim.Subscript(prim.Variable(qp), (inames[0],)))
     else:
         args.append(tabmat.face)
@@ -461,6 +466,7 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
     instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices),
                 expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)),
                 kernel="operator",
+                depends_on=dep,
                 )
 
 
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 27585679b456dd27a3470b999798fab02823eb3b..7d779eff11d82c4acf2fcd810223790537c8e7d6 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -12,6 +12,7 @@ from dune.perftool.generation import (backend,
                                       get_backend,
                                       get_counted_variable,
                                       get_global_context_value,
+                                      kernel_cached,
                                       )
 from dune.perftool.pdelab.restriction import (Restriction,
                                               restricted_name,
@@ -23,6 +24,7 @@ from dune.perftool.error import PerftoolVectorizationError
 from dune.perftool.options import get_form_option, get_option, set_form_option
 from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff
 
+from pymbolic.mapper.flop_counter import FlopCounter
 from pytools import product
 from frozendict import frozendict
 import itertools as it
@@ -98,6 +100,20 @@ def target_costfunction(sf):
     return abs(realcost - ratio * target)
 
 
+def accumulate_for_strategy(strategy, func, reduce=lambda x, y: x + y, start=0):
+    """ Accumulate a quantity over a given vectorization strategy """
+    accum = start
+
+    # Make sure to count each implementation exactly once
+    keys = set(sf.cache_key for sf in strategy.values())
+    for sf in strategy.values():
+        if sf.cache_key in keys:
+            accum = reduce(accum, func(sf))
+            keys.discard(sf.cache_key)
+
+    return accum
+
+
 def strategy_cost(strat_tuple):
     qp, strategy = strat_tuple
 
@@ -109,20 +125,62 @@ def strategy_cost(strat_tuple):
         func = explicit_costfunction
     elif s == "target":
         func = target_costfunction
+    elif s == "autotune":
+        from dune.perftool.sumfact.autotune import autotune_realization
+        func = autotune_realization
     else:
         raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s))
 
     keys = set(sf.cache_key for sf in strategy.values())
     set_quadrature_points(qp)
 
-    # Sum over all the sum factorization kernels in the realization
-    score = 0.0
-    for sf in strategy.values():
-        if sf.cache_key in keys:
-            score = score + float(func(sf))
-            keys.discard(sf.cache_key)
+    return accumulate_for_strategy(strategy, lambda sf: float(func(sf)))
+
+
+class PrimitiveApproximateOpcounter(FlopCounter):
+    def map_sumfact_kernel(self, expr):
+        return 0
+
+    def map_tagged_variable(self, expr):
+        return self.map_variable(expr)
+
+
+@kernel_cached
+def store_operation_count(expr, count):
+    return count
+
+
+def count_quadrature_point_operations(expr):
+    counter = PrimitiveApproximateOpcounter()
+    store_operation_count(expr, counter(expr))
+
+
+def quadrature_penalized_strategy_cost(strat_tuple):
+    """ Implements a penalization of the cost function that accounts for
+    the increase in flops that occur in the quadrature loop. This needs to
+    somehow get a guess of how much work is done in the quadrature loop relative
+    to the sum factorization kernels.
+    """
+    qp, strategy = strat_tuple
+
+    # Evaluate the original cost function. This result will be scaled by this function.
+    cost = strategy_cost(strat_tuple)
+
+    # Get the total number of Flops done in sum factorization kernels
+    sf_flops = accumulate_for_strategy(strategy, lambda sf: sf.operations)
+
+    # Get the minimal possible number of quadrature points and the actual quadrature points
+    num_qp_new = product(qp)
+    set_quadrature_points(None)
+    num_qp_old = product(quadrature_points_per_direction())
+    set_quadrature_points(qp)
+
+    # Determine the number of floating point operations per quadrature point.
+    # This flop counting is a very crude approximation, but that is totally sufficient here.
+    ops_per_qp = sum(i.value for i in store_operation_count._memoize_cache.values())
 
-    return score
+    # Do the actual scaling.
+    return float((sf_flops + ops_per_qp * num_qp_new) / (sf_flops + ops_per_qp * num_qp_old)) * cost
 
 
 def fixedqp_strategy_costfunction(qp):
@@ -296,7 +354,7 @@ def level1_optimal_vectorization_strategy(sumfacts, width):
         with open("mapping.csv", 'a') as f:
             f.write(" ".join((identifier, str(cost), short_stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + "\n")
     else:
-        qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp])))
+        qp = min(optimal_strategies, key=lambda qp: quadrature_penalized_strategy_cost((qp, optimal_strategies[qp])))
 
     return qp, optimal_strategies[qp]
 
diff --git a/python/setup.py b/python/setup.py
index 476a9f6f7b5399c04837af8eb5e627f4182156f2..e4517d2f7f6c1b57b7f57f69c90b20a5f4a05f3f 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -41,7 +41,7 @@ setup(name='dune.perftool',
                 'dune.perftool.ufl',
                 'dune.perftool.ufl.transformations',
                 ],
-      install_requires=['dune.testtools', 'sympy', 'frozendict', 'pytest', 'pytest-pep8'],
+      install_requires=['dune.testtools', 'sympy', 'frozendict', 'pytest', 'pytest-pep8', 'filelock'],
       cmdclass={'test': PyTest},
       entry_points = {
         "console_scripts": [
diff --git a/test/sumfact/poisson/poisson_3d.mini b/test/sumfact/poisson/poisson_3d.mini
index e3e6da7d29475cedd70db44a2b667d6171ae5c80..bf7b0f3bfdc7823e970694ecead48e6ecb4d7f6a 100644
--- a/test/sumfact/poisson/poisson_3d.mini
+++ b/test/sumfact/poisson/poisson_3d.mini
@@ -4,7 +4,7 @@ __exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
 deg_suffix = deg{formcompiler.ufl_variants.degree}
 diff_suffix = numdiff, symdiff | expand num
 quadvec_suffix = quadvec, nonquadvec | expand quad
-gradvec_suffix = gradvec, nongradvec | expand grad
+gradvec_suffix = gradvec, autotunevec, nongradvec | expand grad
 
 cells = 8 8 8
 extension = 1. 1. 1.
@@ -21,7 +21,7 @@ compare_l2errorsquared = 1e-4, 1e-8 | expand deg
 numerical_jacobian = 1, 0 | expand num
 sumfact = 1
 vectorization_quadloop = 1, 0 | expand quad
-vectorization_strategy = explicit, none | expand grad
+vectorization_strategy = explicit, autotune, none | expand grad
 
 [formcompiler.ufl_variants]
 degree = 1, 2 | expand deg