diff --git a/bin/donkey_benchmark_compilation_wrapper.sh b/bin/donkey_benchmark_compilation_wrapper.sh
index e2691b5d71db2473b731fabcea6a1095ec2516d9..a786d111264ef0e67769100713f1776e3e002c3d 100755
--- a/bin/donkey_benchmark_compilation_wrapper.sh
+++ b/bin/donkey_benchmark_compilation_wrapper.sh
@@ -13,7 +13,4 @@ ml parmetis
 
 ("$@")
 code=$?
-echo "Code: $code"
-sleep 0.1s
-
 exit $code
diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py
index 7951b8b06bfdeb217bd3328c9b3b88f229bbc606..d383963318291ae947c70b1f083c320a60250303 100755
--- a/bin/donkey_benchmark_execution_wrapper.py
+++ b/bin/donkey_benchmark_execution_wrapper.py
@@ -13,8 +13,3 @@ ret = subprocess.call(command)
 # If that failed - fail!
 if ret != 0:
     sys.exit(ret)
-
-# If that was successful, wait for the output file to be available on the filesystem
-# This step is necessary because the NFS synchronization is too slow for our workflow.
-while not os.path.isfile(sys.argv[2]):
-    time.sleep(0.1)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index b96bd06cbce6361f1130e17bb7cb4d97efe08ec7..8881504f934e7b1d755580ffc46d47f2a1467346 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -25,5 +25,5 @@ add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
 target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
 
 if(benchmark_FOUND)
-  target_link_libraries(_autotune_target benchmark::benchmark)
+  target_link_libraries(_autotune_target benchmark)
 endif()
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index d19fd7be5587258e8db1f3ada3301b1e7017f95e..a1d7442addc4e2d1d720b1cc81d8a914c1f75532 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -14,6 +14,7 @@ import re
 import subprocess
 import filelock
 import hashlib
+import time
 
 
 def get_cmake_cache_entry(entry):
@@ -94,10 +95,8 @@ def write_global_data(sf, filename):
 
 
 def write_setup_code(sf, filename, define_thetas=True):
-    opcounting = get_option("opcounter")
     with open(filename, "a") as f:
         # Setup a polynomial object (normally done in the LocalOperator members)
-        set_option("opcounter", False)
         from dune.codegen.loopy.target import type_floatingpoint
         real = type_floatingpoint()
         f.write("  using RF = {};\n".format(real))
@@ -116,11 +115,12 @@ def write_setup_code(sf, filename, define_thetas=True):
         constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
 
         # Allocate buffers
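+        # Derive the buffer alignment in bytes from the configured maximum vector width (given in bits)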
+        alignment = get_option("max_vector_width") // 8
         size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
                    product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
         size = int(size * (get_option("precision_bits") / 8))
-        f.writelines(["  char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
-                      "  char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
+        f.writelines(["  char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
+                      "  char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
                       ])
 
         # Setup fastdg inputs
@@ -129,7 +129,7 @@ def write_setup_code(sf, filename, define_thetas=True):
                 f.write("{} = 0;\n".format(arg))
             else:
                 size = sf.interface.fastdg_interface_object_size
-                f.write("  RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))
+                f.write("  RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment))
 
         # Write stuff into the input buffer
         f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
@@ -172,12 +172,15 @@ def write_setup_code(sf, filename, define_thetas=True):
                       "  rng.seed(42);\n",
                       "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                       ])
-    return opcounting
 
 
 def generate_standalone_code_google_benchmark(sf, filename):
     delete_cache_items("kernel_default")
 
+    # Turn opcounting off while the benchmark code is generated; the previous setting is restored below
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
     # Extract sum factorization kernel
     from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
     knl = realize_sumfact_kernel_function(sf)
@@ -205,6 +208,10 @@ def generate_standalone_code_google_benchmark(sf, filename):
     write_global_data(sf, filename)
 
     with open(filename, "a") as f:
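+        # Build the kernel signature: the two scratch buffers come first, followed by any interface-specific arguments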
+        arguments = ', '.join(sf.interface.signature_args)
+        if len(arguments) > 0:
+            arguments = ', ' + arguments
+        arguments = 'const char* buffer0, const char* buffer1' + arguments
         f.write("void sumfact_kernel({})\n".format(arguments))
         for line in knl.member.lines[1:]:
             f.write("{}\n".format(line))
@@ -212,7 +219,7 @@ def generate_standalone_code_google_benchmark(sf, filename):
         f.write("\n\n")
         f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")
 
-    opcounting = write_setup_code(sf, filename, define_thetas=False)
+    write_setup_code(sf, filename, define_thetas=False)
 
     additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
     additional_arguments = ', '.join(additional_arguments)
@@ -227,12 +234,18 @@ def generate_standalone_code_google_benchmark(sf, filename):
                       "\n",
                       "BENCHMARK_MAIN();"
                       ])
+
+    # Restore the previous opcounter setting
     set_option("opcounter", opcounting)
 
 
 def generate_standalone_code(sf, filename):
     delete_cache_items("kernel_default")
 
+    # Turn opcounting off while the benchmark code is generated; the previous setting is restored below
+    opcounting = get_option("opcounter")
+    set_option("opcounter", False)
+
     # Extract sum factorization kernel
     from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
     knl = realize_sumfact_kernel_function(sf)
@@ -257,7 +270,7 @@ def generate_standalone_code(sf, filename):
                       "{\n",
                       ])
 
-    opcounting = write_setup_code(sf, filename)
+    write_setup_code(sf, filename)
 
     # Write measurement
     with open(filename, "a") as f:
@@ -283,7 +296,9 @@ def generate_standalone_code(sf, filename):
                       "  std::cout << accum;\n",
                       "}\n",
                       ])
-        set_option("opcounter", opcounting)
+
+    # Restore the previous opcounter setting
+    set_option("opcounter", opcounting)
 
 
 def autotune_realization(sf):
@@ -320,6 +335,10 @@ def autotune_realization(sf):
                 if ret != 0:
                     raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
 
+                # Filesystem synchronization: NFS can be slow to expose the freshly built executable, so wait for it to appear
+                while not os.path.exists(executable):
+                    time.sleep(0.01)
+
                 # Check whether the user specified an execution wrapper
                 call = []
                 wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER")
@@ -337,11 +356,19 @@ def autotune_realization(sf):
                 if ret != 0:
                     raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
 
+                # Filesystem synchronization: likewise wait for the benchmark log file to appear
+                while not os.path.exists(logname):
+                    time.sleep(0.01)
+
             # Extract the result from the log file
             if get_option("autotune_google_benchmark"):
                 import json
                 with open(logname) as json_file:
-                    data = json.load(json_file)
-                    return data['benchmarks'][0]['cpu_time']
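+                    # Report which log file could not be parsed before re-raising the error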
+                    try:
+                        data = json.load(json_file)
+                        return data['benchmarks'][0]['cpu_time']
+                    except Exception as e:
+                        print("Error while loading file {}".format(logname))
+                        raise e
             else:
                 return float(next(iter(open(logname, "r")))) / 1000000