diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cf8469e5b7b3ad58a370e1c114229740813b812..d26e0ab5fa633638a039ad51fc61f9545aab4e26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,5 +40,7 @@ add_subdirectory(test)
 add_subdirectory(bin)
 add_subdirectory(applications)
 
+add_executable(autotune_target EXCLUDE_FROM_ALL autotune.cc)
+
 # finalize the dune project, e.g. generating config.h etc.
 finalize_dune_project(GENERATE_CONFIG_H_CMAKE)
diff --git a/python/_autotune.cc b/python/_autotune.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c87b37802d394fb8dafc0ca6ae150797e81199a
--- /dev/null
+++ b/python/_autotune.cc
@@ -0,0 +1,2 @@
+int main()
+{}
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 9d4ccb3f5da38ac1d83bd5d7b63f7c7a1059c99d..2bbc2474063943f198e778123ecd23cc1d4abde7 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -82,7 +82,7 @@ class PerftoolFormOptionsArray(ImmutableRecord):
     sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
     vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
     vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 3fb213c87e98883e4f474c4145090b40aff1eec4..1b49197e8e48af700e3697ef3a90b338cf7fa200 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -459,7 +459,7 @@ def generate_kernel(integrals):
     # Delete the cache contents and do the real thing!
     logger.debug('generate_kernel: visit_integrals (no dry run)')
     from dune.perftool.generation import delete_cache_items
-    delete_cache_items("kernel_default")
+    delete_cache_items("kernel_default or member")
     for integral in integrals:
         visit_integral(integral)
 
diff --git a/python/dune/perftool/sumfact/autotune.py b/python/dune/perftool/sumfact/autotune.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d171ecf8921eb7c1dc8ee21a1df310baafedfb6
--- /dev/null
+++ b/python/dune/perftool/sumfact/autotune.py
@@ -0,0 +1,182 @@
+""" Autotuning for sum factorization kernels """
+
+from dune.perftool.generation import delete_cache_items
+from dune.perftool.loopy.target import DuneTarget
+from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
+from dune.perftool.options import get_option
+
+import loopy as lp
+from pytools import product
+
+import os
+import re
+import subprocess
+
+
+def get_cmake_cache_entry(entry):
+    for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
+        match = re.match("{}:[INTERNAL|FILEPATH|BOOL|STRING|PATH]+=(.*)".format(entry), line)
+        if match:
+            return match.groups()[0]
+
+
+def compiler_invocation(name, filename):
+    # Determine the CMake Generator in use
+    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
+    assert(gen == "Unix Makefiles")
+
+    # Find compiler path
+    compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
+    compile_flags = [compiler]
+
+    # Parse compiler flags
+    for line in open(os.path.join(get_option("project_basedir"), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"):
+        match = re.match("([^=]*)=(.*)", line)
+        if match:
+            compile_flags.extend(match.groups()[1].split())
+
+    # Add the source file
+    compile_flags.append(filename)
+
+    # Parse linker flags
+    for line in open(os.path.join(get_option("project_basedir"), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"):
+        match = re.match(".*_autotune_target (.*)", line)
+        if match:
+            for flag in match.groups()[0].split():
+                if flag.startswith("-") or os.path.isabs(flag):
+                    compile_flags.append(flag)
+                else:
+                    compile_flags.append(os.path.join(get_option("project_basedir"), "python", flag))
+
+    # Set an output name
+    compile_flags.append("-o")
+    compile_flags.append(name)
+
+    return compile_flags
+
+
+def generate_standalone_code(sf, filename, logname):
+    delete_cache_items("kernel_default")
+
+    with open(filename, "w") as f:
+        f.writelines(["#include \"config.h\"\n",
+                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
+                      "#include<dune/perftool/common/tsc.hh>\n",
+                      "#include<dune/perftool/common/vectorclass.hh>\n",
+                      "#include<dune/perftool/sumfact/onedquadrature.hh>\n",
+                      "#include<random>\n",
+                      "#include<fstream>\n",
+                      "\n"
+                      ])
+
+        f.writelines(["int main(int argc, char** argv)\n",
+                      "{\n",
+                      ])
+
+        # Setup a polynomial object (normally done in the LocalOperator members)
+        from dune.perftool.loopy.target import type_floatingpoint
+        real = type_floatingpoint()
+        f.write("  using RF = {};\n".format(real))
+        f.write("  using DF = {};\n".format(real))
+
+        from dune.perftool.sumfact.tabulation import name_polynomials
+        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence)
+        for deg in set(degs):
+            f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))
+
+        # Get kernels
+        from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
+        knl = realize_sumfact_kernel_function(sf)
+        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
+        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
+        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
+
+        # Allocate buffers
+        size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
+                   product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
+        f.writelines(["  char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
+                      "  char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
+                      ])
+
+        # Write stuff into the input buffer
+        f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
+                      "  {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
+                      "  for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)),
+                      "    input[i] = ({})(i+1);\n".format(real),
+                      ])
+
+        target = DuneTarget()
+        from loopy.codegen import CodeGenerationState
+        codegen_state = CodeGenerationState(kernel=constructor_knl,
+                                            implemented_data_info=None,
+                                            implemented_domain=None,
+                                            implemented_predicates=frozenset(),
+                                            seen_dtypes=frozenset(),
+                                            seen_functions=frozenset(),
+                                            seen_atomic_dtypes=frozenset(),
+                                            var_subst_map={},
+                                            allow_complex=False,
+                                            is_generating_device_code=True,
+                                            )
+
+        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
+            f.write("  {}\n".format(next(iter(decl.generate()))))
+
+        for _, line in constructor_knl.preambles:
+            if "gfsu" not in line:
+                f.write("  {}\n".format(line))
+
+        # Add setup code for theta matrices. We add some lines not necessary,
+        # but it would be more work to remove them than keeping them.
+        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
+            if "gfsu" not in line and "meshwidth" not in line:
+                f.write("  {}\n".format(line))
+
+        # INtroduces a variable that makes sure that the kernel cannot be optimized away
+        f.writelines(["  {} accum;\n".format(real),
+                      "  std::mt19937 rng;\n",
+                      "  rng.seed(42);\n",
+                      "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
+                      ])
+
+        # Start a TSC timer
+        f.writelines([
+                      "  auto start = Dune::PDELab::TSC::start();\n",
+                      ])
+
+        # Add the implementation of the kernel.
+        f.write("  for(int i=0; i<10000000; ++i)\n")
+        f.write("  {\n")
+        for line in knl.member.lines[1:]:
+            f.write("    {}\n".format(line))
+        f.write("  }\n")
+
+        # Stop the TSC timer and write the result to a file
+        f.writelines(["  auto stop = Dune::PDELab::TSC::stop();\n",
+                      "  std::ofstream file;\n",
+                      "  file.open(\"{}\");\n".format(logname),
+                      "  file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
+                      "  file.close();\n",
+                      "  accum += output[dis(rng)];\n",
+                      "  std::cout << accum;\n",
+                      "}\n",
+                      ])
+
+
+def autotune_realization(sf):
+    name = "autotune_sumfact_{}".format(sf.function_name)
+    filename = "{}.cc".format(name)
+    logname = "{}.log".format(name)
+
+    # Generate and compile a benchmark program
+    generate_standalone_code(sf, filename, logname)
+    ret = subprocess.call(compiler_invocation(name, filename))
+    assert ret == 0
+
+    # Run the benchmark program
+    devnull = open(os.devnull, 'w')
+    ret = subprocess.call(["./{}".format(name)], stdout=devnull, stderr=subprocess.STDOUT)
+    assert ret == 0
+
+    # Extract the result form the log file
+    return float(next(iter(open(logname, "r")))) / 1000000
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index de13c45c3105e8c0e7411d1d849f7f55f4629e96..74a6a3d1e400210395f11314397f39acf0acc4b6 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -362,13 +362,17 @@ def name_polynomials(degree):
     return name
 
 
-@preamble(kernel="operator")
 def sort_quadrature_points_weights(qp, qw, bound):
     range_field = lop_template_range_field()
     domain_field = name_domain_field()
     include_file("dune/perftool/sumfact/onedquadrature.hh", filetag="operatorfile")
-    return "onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \
-        .format(range_field, domain_field, bound, qp, qw)
+    return frozenset({instruction(code="onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \
+                                  .format(range_field, domain_field, bound, qp, qw),
+                                  assignees=frozenset({qp, qw}),
+                                  read_variables=frozenset({qp, qw}),
+                                  kernel="operator",
+                                  ),
+                      })
 
 
 @iname(kernel="operator")
@@ -439,10 +443,11 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
 
     args = [inames[1]]
 
+    dep = frozenset()
     if tabmat.face is None:
         qp = name_oned_quadrature_points(bound)
         qw = name_oned_quadrature_weights(bound)
-        sort_quadrature_points_weights(qp, qw, bound)
+        dep = sort_quadrature_points_weights(qp, qw, bound)
         args.append(prim.Subscript(prim.Variable(qp), (inames[0],)))
     else:
         args.append(tabmat.face)
@@ -450,6 +455,7 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
     instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices),
                 expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)),
                 kernel="operator",
+                depends_on=dep,
                 )
 
 
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index aed3f4a30b9115fc0a4aef1169f85db8e492853a..58df262f7158fa9bd8d855bfe665b10172bc1f04 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -110,6 +110,9 @@ def strategy_cost(strat_tuple):
         func = explicit_costfunction
     elif s == "target":
         func = target_costfunction
+    elif s == "autotune":
+        from dune.perftool.sumfact.autotune import autotune_realization
+        func = autotune_realization
     else:
         raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s))