diff --git a/bin/barplot_coarse.py b/bin/barplot_coarse.py
index 67de3aefb4e9bb0d4ef42a2b61a51fd28ddc873d..47906024390a599c882437cf439a50685c45d430 100644
--- a/bin/barplot_coarse.py
+++ b/bin/barplot_coarse.py
@@ -19,19 +19,15 @@ opframe = pandas.read_csv("./operations.csv",
 
 av = opframe[opframe.what == "alpha_volume_kernel"]
 ask = opframe[opframe.what == "alpha_skeleton_kernel"]
-ab = opframe[opframe.what == "alpha_boundary_kernel"]
 deg = [i - 0.3 for i in sorted(av['degree'])]
 
 _, y1 = list(itertools.izip(*sorted(itertools.izip(av['degree'], av['ops']))))
 _, y2 = list(itertools.izip(*sorted(itertools.izip(ask['degree'], ask['ops']))))
 y2 = [a+b for a, b in zip(y1, y2)]
-_, y3 = list(itertools.izip(*sorted(itertools.izip(ab['degree'], ab['ops']))))
-y3 = [a+b for a, b in zip(y2, y3)]
-y4 = [1.0] * len(deg)
+y3 = [1.0] * len(deg)
 
-r4 = ax.bar(deg, y4, width, color="grey") 
-r3 = ax.bar(deg, y3, width, color="green") 
-r2 = ax.bar(deg, y2, width, color="blue") 
+r3 = ax.bar(deg, y3, width, color="grey")
+r2 = ax.bar(deg, y2, width, color="blue")
 r1 = ax.bar(deg, y1, width, color="red")
 
 ax.set_ylabel("Percentage")
@@ -46,19 +42,15 @@ timeframe = pandas.read_csv("./timeratios.csv",
 
 av = timeframe[timeframe.what == "alpha_volume_kernel"]
 ask = timeframe[timeframe.what == "alpha_skeleton_kernel"]
-ab = timeframe[timeframe.what == "alpha_boundary_kernel"]
 deg = [i + 0.3 for i in sorted(av['degree'])]
 
 _, y1 = list(itertools.izip(*sorted(itertools.izip(av['degree'], av['ops']))))
 _, y2 = list(itertools.izip(*sorted(itertools.izip(ask['degree'], ask['ops']))))
 y2 = [a+b for a, b in zip(y1, y2)]
-_, y3 = list(itertools.izip(*sorted(itertools.izip(ab['degree'], ab['ops']))))
-y3 = [a+b for a, b in zip(y2, y3)]
-y4 = [1.0] * len(deg)
+y3 = [1.0] * len(deg)
 
-r4 = ax.bar(deg, y4, width, color="grey", label="PDELab overhead") 
-r3 = ax.bar(deg, y3, width, color="green", label="Boundary integrals") 
-r2 = ax.bar(deg, y2, width, color="blue", label="Skeleton integrals") 
+r3 = ax.bar(deg, y3, width, color="grey", label="PDELab overhead")
+r2 = ax.bar(deg, y2, width, color="blue", label="Skeleton integrals")
 r1 = ax.bar(deg, y1, width, color="red", label="Volume integrals")
 
 ax.legend(loc=3,
diff --git a/bin/barplot_fine.py b/bin/barplot_fine.py
index 4e050c80ef775780c559ed82bfa6f4a2acba1564..47f4514da839afc43d7c588ce4a7158e5d08e5b0 100644
--- a/bin/barplot_fine.py
+++ b/bin/barplot_fine.py
@@ -28,7 +28,8 @@ av3 = opframe[opframe.what == "alpha_volume_kernel_stage3"]
 ask1 = opframe[opframe.what == "alpha_skeleton_kernel_stage1"]
 ask2 = opframe[opframe.what == "alpha_skeleton_kernel_quadratureloop"]
 ask3 = opframe[opframe.what == "alpha_skeleton_kernel_stage3"]
-ab = opframe[opframe.what == "alpha_boundary_kernel"]
+setup1 = opframe[opframe.what == "alpha_volume_kernel_setup"]
+setup2 = opframe[opframe.what == "alpha_skeleton_kernel_setup"]
 deg = [i - width/1.8 for i in sorted(av1['degree'])]
 
 
@@ -38,23 +39,26 @@ def update(frame, result=None):
         return y
     else:
         return [a+b for a, b in zip(y, result)]
-        
+
+
 y1 = update(av1)
 y2 = update(av2, y1)
 y3 = update(av3, y2)
-y4 = update(ask1, y3)
-y5 = update(ask2, y4)
-y6 = update(ask3, y5)
-y7 = update(ab, y6)
-y8 = [1.0] * len(deg)
-
-r8 = ax.bar(deg, y8, width, color="grey") 
-r7 = ax.bar(deg, y7, width, color="green") 
-r6 = ax.bar(deg, y6, width, color="xkcd:light blue") 
-r5 = ax.bar(deg, y5, width, color="xkcd:blue")
-r4 = ax.bar(deg, y4, width, color="xkcd:dark blue") 
-r3 = ax.bar(deg, y3, width, color="yellow") 
-r2 = ax.bar(deg, y2, width, color="orange") 
+y4 = update(setup1, y3)
+y5 = update(ask1, y4)
+y6 = update(ask2, y5)
+y7 = update(ask3, y6)
+y8 = update(setup2, y7)
+y9 = [1.0] * len(deg)
+
+r9 = ax.bar(deg, y9, width, color="grey")
+r8 = ax.bar(deg, y8, width, color="xkcd:light green")
+r7 = ax.bar(deg, y7, width, color="xkcd:light blue")
+r6 = ax.bar(deg, y6, width, color="xkcd:blue")
+r5 = ax.bar(deg, y5, width, color="xkcd:dark blue")
+r4 = ax.bar(deg, y4, width, color="green")
+r3 = ax.bar(deg, y3, width, color="yellow")
+r2 = ax.bar(deg, y2, width, color="orange")
 r1 = ax.bar(deg, y1, width, color="red")
 
 ax.set_ylabel("Percentage")
@@ -73,25 +77,28 @@ av3 = timeframe[timeframe.what == "alpha_volume_kernel_stage3"]
 ask1 = timeframe[timeframe.what == "alpha_skeleton_kernel_stage1"]
 ask2 = timeframe[timeframe.what == "alpha_skeleton_kernel_quadratureloop"]
 ask3 = timeframe[timeframe.what == "alpha_skeleton_kernel_stage3"]
-ab = timeframe[timeframe.what == "alpha_boundary_kernel"]
+setup1 = timeframe[timeframe.what == "alpha_volume_kernel_setup"]
+setup2 = timeframe[timeframe.what == "alpha_skeleton_kernel_setup"]
 deg = [i + width/1.8 for i in sorted(av1['degree'])]
 
 y1 = update(av1)
 y2 = update(av2, y1)
 y3 = update(av3, y2)
-y4 = update(ask1, y3)
-y5 = update(ask2, y4)
-y6 = update(ask3, y5)
-y7 = update(ab, y6)
-y8 = [1.0] * len(deg)
-
-r8 = ax.bar(deg, y8, width, label="PDELab overhead", color="grey") 
-r7 = ax.bar(deg, y7, width, label="Boundary", color="green") 
-r6 = ax.bar(deg, y6, width, label="Skeleton, backward SF", color="xkcd:light blue") 
-r5 = ax.bar(deg, y5, width, label="Skeleton, Quadrature", color="xkcd:blue")
-r4 = ax.bar(deg, y4, width, label="Skeleton, forward SF", color="xkcd:dark blue") 
-r3 = ax.bar(deg, y3, width, label="Volume, backward SF", color="yellow") 
-r2 = ax.bar(deg, y2, width, label="Volume, Quadrature", color="orange") 
+y4 = update(setup1, y3)
+y5 = update(ask1, y4)
+y6 = update(ask2, y5)
+y7 = update(ask3, y6)
+y8 = update(setup2, y7)
+y9 = [1.0] * len(deg)
+
+r9 = ax.bar(deg, y9, width, label="PDELab overhead", color="grey")
+r8 = ax.bar(deg, y8, width, label="Geometry evaluations Skeleton", color="xkcd:light green")
+r7 = ax.bar(deg, y7, width, label="Skeleton, backward SF", color="xkcd:light blue")
+r6 = ax.bar(deg, y6, width, label="Skeleton, Quadrature", color="xkcd:blue")
+r5 = ax.bar(deg, y5, width, label="Skeleton, forward SF", color="xkcd:dark blue")
+r4 = ax.bar(deg, y4, width, label="Geometry evaluations Volume", color="green")
+r3 = ax.bar(deg, y3, width, label="Volume, backward SF", color="yellow")
+r2 = ax.bar(deg, y2, width, label="Volume, Quadrature", color="orange")
 r1 = ax.bar(deg, y1, width, label="Volume, forward SF", color="red")
 
 lgd = ax.legend(loc=3,
diff --git a/bin/knltimings.sh b/bin/knltimings.sh
index e299237fa622cc5c6f7022365c0d98cb60e886c2..4da92dca0743450a78f36f568ea7d50d5cb0d64e 100755
--- a/bin/knltimings.sh
+++ b/bin/knltimings.sh
@@ -7,12 +7,12 @@ then
 fi
 
 # Search for runnable executables
-FILES=$(ls *.ini)
+FILES=$(ls *.ini | grep -v '^verify')
 for inifile in $FILES
 do
   line=$(grep ^"opcounter = " $inifile)
   extract=${line##opcounter = }
- UPPER=1
+  UPPER=10
   if [ $extract -eq 1 ]
   then
     UPPER=1
diff --git a/bin/process_measurements.py b/bin/process_measurements.py
index 0d45c7293befca1c166617746c13069c3e9c57cc..53fb0873dee3a3cf3719c2faf5f98bc2a2925a18 100755
--- a/bin/process_measurements.py
+++ b/bin/process_measurements.py
@@ -22,10 +22,27 @@ def get_reference_kernel(kernel):
     return kernel
 
 
+def get_reference_integral_kernel(kernel):
+    if kernel.startswith("alpha") and len(kernel.split("_")) == 4:
+        return "_".join(kernel.split("_")[:-1])
+    if kernel.startswith("jacobian_apply") and len(kernel.split("_")) == 5:
+        return "_".join(kernel.split("_")[:-1])
+    return kernel
+
+
+def get_siblings_kernel(kernel):
+    s = kernel.split("_")
+    if (kernel.startswith("alpha") and len(s) == 4) or (kernel.startswith("jacobian_apply") and len(s) == 5):
+        for suffix in ["setup", "stage1", "quadratureloop", "stage3"]:
+            yield "_".join(s[:-1] + [suffix])
+    else:
+        yield kernel
+
 def calculate_operations_percentage():
-    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
+    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
     ops = frame[frame.what != "time"]
     ops = ops.groupby(('rank', 'ident', 'kernel'))['value'].max().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max()
+
     with open('operations.csv', 'w') as out:
         for key in ops.keys():
             ident, kernel = key
@@ -34,18 +51,26 @@ def calculate_operations_percentage():
 
 
 def calculate_times_percentage():
-    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
-    time = frame[frame.what == "time"]
-    time = time.groupby(('rank', 'ident', 'kernel'))['value'].min().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max()
+    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
+
+    time4 = frame[(frame.what == "time") & (frame.level == 4)]
+    time4 = time4.groupby(('rank', 'ident', 'kernel'))['value'].min().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max()
+    time3 = frame[(frame.what == "time") & (frame.level == 3)]
+    time3 = time3.groupby(('rank', 'ident', 'kernel'))['value'].min().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max()
+
     with open('timeratios.csv', 'w') as out:
-        for key in time.keys():
+        for key in time4.keys():
             ident, kernel = key
             degree = re.match(".*deg([0-9]*).*", ident).group(1)
-            out.write(" ".join([ident, degree, kernel, str(time[ident][kernel] / time[ident][get_reference_kernel(kernel)]) + "\n"]))
+            stage = time4[ident][kernel]
+            sum4 = sum(time4[ident][k] for k in get_siblings_kernel(kernel))
+            int3 = time3[ident][get_reference_integral_kernel(kernel)]
+            ref =  time3[ident][get_reference_kernel(kernel)]
+            out.write(" ".join([ident, degree, kernel, str(float(stage*int3) / float(sum4*ref)) + "\n"]))
 
 
 def calculate_floprate():
-    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
+    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
     time = frame[frame.what == "time"]
     ops = frame[frame.what != "time"]
 
@@ -56,11 +81,12 @@ def calculate_floprate():
         for key in time.keys():
             ident, kernel = key
             degree = re.match(".*deg([0-9]*).*", ident).group(1)
-            out.write(" ".join([ident, degree, kernel, str((ops[ident][kernel] / time[ident][kernel]) / 1e9)]) + "\n")
+            operations = ops[ident][kernel]
+            out.write(" ".join([ident, degree, kernel, str((operations / time[ident][kernel]) / 1e9)]) + "\n")
 
 
 def calculate_doftimes():
-    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
+    frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ')
     dofs = frame[frame.what == "dofs"]
     time = frame[frame.what == "time"]
 
diff --git a/dune/perftool/common/opcounter.hh b/dune/perftool/common/opcounter.hh
index 9bcd4ce2fc252d6fde7b2c50339904d6cffc6ae4..5a0115e6b5649d45b86db1432d83a66330e00e0e 100644
--- a/dune/perftool/common/opcounter.hh
+++ b/dune/perftool/common/opcounter.hh
@@ -158,22 +158,22 @@ namespace oc {
         blend_count = 0;
       }
 
-      template<typename Stream>
-      void reportOperations(Stream& os, std::string exec, std::string kernel, bool doReset = false)
+      template<typename Stream, typename T>
+      void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false)
       {
         auto total = addition_count + multiplication_count + division_count + exp_count + pow_count + sin_count + sqrt_count + comparison_count;
         if (total == 0)
           return;
-        os << exec << " " << kernel << " additions " << addition_count << std::endl
-           << exec << " " << kernel << " multiplications " << multiplication_count << std::endl
-           << exec << " " << kernel << " divisions " << division_count << std::endl
-           << exec << " " << kernel << " exp " << exp_count << std::endl
-           << exec << " " << kernel << " pow " << pow_count << std::endl
-           << exec << " " << kernel << " sin " << sin_count << std::endl
-           << exec << " " << kernel << " sqrt " << sqrt_count << std::endl
-           << exec << " " << kernel << " comparisons " << comparison_count << std::endl
-           << exec << " " << kernel << " blends " << blend_count << std::endl
-           << exec << " " << kernel << " totalops " << total << std::endl;
+        os << level << " " << exec << " " << kernel << " additions " << addition_count << std::endl
+           << level << " " << exec << " " << kernel << " multiplications " << multiplication_count << std::endl
+           << level << " " << exec << " " << kernel << " divisions " << division_count << std::endl
+           << level << " " << exec << " " << kernel << " exp " << exp_count << std::endl
+           << level << " " << exec << " " << kernel << " pow " << pow_count << std::endl
+           << level << " " << exec << " " << kernel << " sin " << sin_count << std::endl
+           << level << " " << exec << " " << kernel << " sqrt " << sqrt_count << std::endl
+           << level << " " << exec << " " << kernel << " comparisons " << comparison_count << std::endl
+           << level << " " << exec << " " << kernel << " blends " << blend_count << std::endl
+           << level << " " << exec << " " << kernel << " totalops " << total << std::endl;
 
         if (doReset)
           reset();
@@ -240,10 +240,10 @@ namespace oc {
       counters.reset();
     }
 
-    template<typename Stream>
-    static void reportOperations(Stream& os, std::string exec, std::string kernel, bool doReset = false)
+    template<typename Stream, typename T>
+    static void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false)
     {
-      counters.reportOperations(os, exec, kernel, doReset);
+      counters.reportOperations(os, level, exec, kernel, doReset);
     }
 
     static Counters counters;
diff --git a/dune/perftool/common/timer_chrono.hh b/dune/perftool/common/timer_chrono.hh
index 6e2098a10a2433d0a3ffdf108d8d226ed82cf3f0..d1b0bd9b5d50e43a32a07623ef417c76337d8af6 100644
--- a/dune/perftool/common/timer_chrono.hh
+++ b/dune/perftool/common/timer_chrono.hh
@@ -79,35 +79,35 @@
 
 #ifdef ENABLE_COUNTER
 
-#define DUMP_TIMER(name,os,reset)\
+#define DUMP_TIMER(level,name,os,reset)\
   if (HP_TIMER_ELAPSED(name) > 1e-12) \
-    os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
-  HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset);
+    os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
+  HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset);
 
-#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops)  \
+#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops)  \
   if (HP_TIMER_ELAPSED(name) > 1e-12) \
-    os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
+    os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
   time += HP_TIMER_ELAPSED(name); \
   ops += HP_TIMER_OPCOUNTERS(name); \
-  HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset);
+  HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset);
 
 #elif defined ENABLE_HP_TIMERS
 
-#define DUMP_TIMER(name,os,reset)                       \
+#define DUMP_TIMER(level,name,os,reset)                       \
   if (HP_TIMER_ELAPSED(name) > 1e-12) \
-    os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
+    os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
   if (reset) HP_TIMER_RESET(name);
 
-#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops)  \
+#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops)  \
   if (HP_TIMER_ELAPSED(name) > 1e-12) \
-    os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
+    os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \
   time += HP_TIMER_ELAPSED(name); \
   if (reset) HP_TIMER_RESET(name);
 
 #else
 
-#define DUMP_TIMER(name,os,reset)
-#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops)
+#define DUMP_TIMER(level,name,os,reset)
+#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops)
 
 #endif
 
diff --git a/dune/perftool/common/timer_tsc.hh b/dune/perftool/common/timer_tsc.hh
index a893fdae507fc2319ce16206b0baefe30e305b81..449badc271f35c81cef500b234eb2d6eb315eaaf 100644
--- a/dune/perftool/common/timer_tsc.hh
+++ b/dune/perftool/common/timer_tsc.hh
@@ -77,35 +77,35 @@
 
 #ifdef ENABLE_COUNTER
 
-#define DUMP_TIMER(name,os,reset)\
+#define DUMP_TIMER(level,name,os,reset)\
   if (HP_TIMER_DURATION(name) > 1e-12) \
-    os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
-  HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset);
+    os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
+  HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset);
 
-#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops)  \
+#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops)  \
   if (HP_TIMER_DURATION(name) > 1e-12) \
-    os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
+    os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
   time += HP_TIMER_DURATION(name); \
   ops += HP_TIMER_OPCOUNTERS(name); \
-  HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset);
+  HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset);
 
 #elif defined ENABLE_HP_TIMERS
 
-#define DUMP_TIMER(name,os,reset)                       \
+#define DUMP_TIMER(level,name,os,reset)                       \
   if (HP_TIMER_DURATION(name) > 1e-12) \
-    os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
+    os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
   if (reset) HP_TIMER_RESET(name);
 
-#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops)  \
+#define DUMP_AND_ACCUMULATE_TIMER(level, name,os,reset,time,ops)  \
   if (HP_TIMER_DURATION(name) > 1e-12) \
-    os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
+    os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
   time += HP_TIMER_DURATION(name); \
   if (reset) HP_TIMER_RESET(name);
 
 #else
 
-#define DUMP_TIMER(name,os,reset)
-#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops)
+#define DUMP_TIMER(level,name,os,reset)
+#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops)
 
 #endif
 
diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index 9d924bf900afc5fc1865b142cb0501d8a78b7a34..6a40a8ea466f1c36fdbe18d5b7557441589cefdf 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -33,15 +33,30 @@ void transpose_reg(Vec4d& a0, Vec4d& a1)
 
 void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
 {
-  Vec8d b0, b1, b2, b3;
-  b0 = blend8d<0,1,8,9,2,3,10,11>(a0, a1);
-  b1 = blend8d<4,5,12,13,6,7,14,15>(a0, a1);
-  b2 = blend8d<0,1,8,9,2,3,10,11>(a2, a3);
-  b3 = blend8d<4,5,12,13,6,7,14,15>(a2, a3);
-  a0 = blend8d<0,1,2,3,8,9,10,11>(b0, b2);
-  a1 = blend8d<4,5,6,7,12,13,14,15>(b0, b2);
-  a2 = blend8d<0,1,2,3,8,9,10,11>(b1, b3);
-  a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
+    auto ac = _mm512_castpd_si512(a);
+    auto bc = _mm512_castpd_si512(b);
+    auto cc = _mm512_castpd_si512(c);
+    auto dc = _mm512_castpd_si512(d);
+
+    auto t1 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(ac, 0xCC, bc, ac, 6));
+    auto t2 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(bc, 0x33, bc, ac, 2));
+    auto t3 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(cc, 0xCC, dc, cc, 6));
+    auto t4 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(dc, 0x33, dc, cc, 2));
+
+    a = _mm512_insertf64x4(t1, _mm512_extractf64x4_pd(t3, 0), 1);
+    c = _mm512_insertf64x4(t3, _mm512_extractf64x4_pd(t1, 1), 0);
+    b = _mm512_insertf64x4(t2, _mm512_extractf64x4_pd(t4, 0), 1);
+    d = _mm512_insertf64x4(t4, _mm512_extractf64x4_pd(t2, 1), 0);
+//
+//  Vec8d b0, b1, b2, b3;
+//  b0 = blend8d<0,1,8,9,2,3,10,11>(a0, a1);
+//  b1 = blend8d<4,5,12,13,6,7,14,15>(a0, a1);
+//  b2 = blend8d<0,1,8,9,2,3,10,11>(a2, a3);
+//  b3 = blend8d<4,5,12,13,6,7,14,15>(a2, a3);
+//  a0 = blend8d<0,1,2,3,8,9,10,11>(b0, b2);
+//  a1 = blend8d<4,5,6,7,12,13,14,15>(b0, b2);
+//  a2 = blend8d<0,1,2,3,8,9,10,11>(b1, b3);
+//  a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
 }
 
 void transpose_reg (Vec8d& a0, Vec8d& a1)
diff --git a/patches/vectorclass/0001-Add-an-alternative-implementation-of-horizontal_add-.patch b/patches/vectorclass/0001-Add-an-alternative-implementation-of-horizontal_add-.patch
new file mode 100644
index 0000000000000000000000000000000000000000..10f1cd949df56339c25682fb578343e16774ceef
--- /dev/null
+++ b/patches/vectorclass/0001-Add-an-alternative-implementation-of-horizontal_add-.patch
@@ -0,0 +1,41 @@
+From d46b04686c44bea537773c99e5001482972a454a Mon Sep 17 00:00:00 2001
+From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
+Date: Thu, 12 Oct 2017 15:14:48 +0200
+Subject: [PATCH] Add an alternative implementation of horizontal_add for Vec8d
+
+---
+ vectorf512.h | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/vectorf512.h b/vectorf512.h
+index 0845d12..fb1a4d4 100644
+--- a/vectorf512.h
++++ b/vectorf512.h
+@@ -1339,13 +1339,19 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b)
+ 
+ 
+ // General arithmetic functions, etc.
++extern __inline double
++__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
++_mm512_cvtsd_f64 (__m512d __A)
++{
++  return __A[0];
++}
+ 
+ // Horizontal add: Calculates the sum of all vector elements.
+-static inline double horizontal_add (Vec8d const & a) {
+-#if defined(__INTEL_COMPILER)
+-    return _mm512_reduce_add_pd(a);
+-#else
+-    return horizontal_add(a.get_low() + a.get_high());
++static inline double horizontal_add (Vec8d const & x) {
++    __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)$
++    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castp$
++    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castp$
++    return _mm512_cvtsd_f64(intermediate);
+ #endif
+ }
+ 
+-- 
+2.1.4
+
diff --git a/python/dune/perftool/generation/cpp.py b/python/dune/perftool/generation/cpp.py
index 0db90171a584a2237314fbf399e89656c7154a21..0f44b6950ef9b111f90f0d7a922835912c5b4500 100644
--- a/python/dune/perftool/generation/cpp.py
+++ b/python/dune/perftool/generation/cpp.py
@@ -2,7 +2,7 @@
 Define some generators based on the caching mechanism that
 are commonly needed for code generation
 """
-
+from dune.perftool.options import get_option
 from dune.perftool.generation import generator_factory
 from dune.perftool.cgen.clazz import AccessModifier, BaseClass, ClassMember
 
@@ -48,5 +48,5 @@ def dump_accumulate_timer(name):
     # reset = name_time_dumper_reset()
     reset = 'false'
 
-    code = "DUMP_TIMER({},{},{});".format(name, os, reset)
+    code = "DUMP_TIMER({},{},{},{});".format(get_option("instrumentation_level"), name, os, reset)
     return code
diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py
index 2c38881f4a7351a199bc43f1515d8fc55c9914d2..46ced0b36a5032a32f1aa2ca59e9879e998a96c8 100644
--- a/python/dune/perftool/pdelab/driver/__init__.py
+++ b/python/dune/perftool/pdelab/driver/__init__.py
@@ -295,7 +295,7 @@ def generate_driver(formdatas, data):
         post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver")
         contents.insert(0, Line(text="HP_TIMER_START(driver);\n"))
         contents.insert(len(contents) - 1, Line(text="HP_TIMER_STOP(driver);\n"))
-        contents.insert(len(contents) - 1, Line(text="DUMP_TIMER(driver, {}, true);\n".format(timestream)))
+        contents.insert(len(contents) - 1, Line(text="DUMP_TIMER({}, driver, {}, true);\n".format(get_option("instrumentation_level"), timestream)))
     contents.insert(0, Line(text="\n"))
     driver_body = Block(contents)
     driver = FunctionBody(driver_signature, driver_body)
diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py
index 389a10c4c350801d71b7035e7a8115de8f8cff16..8042492c357580f23270f01718d02fc2823d99bd 100644
--- a/python/dune/perftool/pdelab/driver/solve.py
+++ b/python/dune/perftool/pdelab/driver/solve.py
@@ -67,7 +67,7 @@ def dune_solve():
         solve = ["HP_TIMER_START(solve);",
                  "{}".format(solve),
                  "HP_TIMER_STOP(solve);",
-                 "DUMP_TIMER(solve, {}, true);".format(timestream),
+                 "DUMP_TIMER({}, solve, {}, true);".format(get_option("instrumentation_level"), timestream),
                  ]
         if get_option('instrumentation_level') >= 3:
             solve.extend(print_times)
diff --git a/python/dune/perftool/pdelab/driver/timings.py b/python/dune/perftool/pdelab/driver/timings.py
index d420e61eba8adfcad4188f3cdf9e7441de610ba1..5c43a67a1550749dd92776688940744d711d2731 100644
--- a/python/dune/perftool/pdelab/driver/timings.py
+++ b/python/dune/perftool/pdelab/driver/timings.py
@@ -11,7 +11,9 @@ from dune.perftool.pdelab.driver import (get_formdata,
                                          name_initree,
                                          name_mpihelper,
                                          )
-from dune.perftool.pdelab.driver.gridfunctionspace import (name_trial_gfs,
+from dune.perftool.pdelab.driver.gridfunctionspace import (name_leafview,
+                                                           name_trial_gfs,
+                                                           type_leafview,
                                                            )
 from dune.perftool.pdelab.driver.gridoperator import (name_gridoperator,
                                                       name_localoperator,
@@ -37,9 +39,16 @@ def name_timing_identifier():
 @preamble
 def dump_dof_numbers(stream):
     ident = name_timing_identifier()
-    return "{} << {} << \" dofs dofs \" << {}.size() << std::endl;".format(stream,
-                                                                           ident,
-                                                                           name_trial_gfs())
+    level = get_option("instrumentation_level")
+    include_file("dune/pdelab/common/partitionviewentityset.hh", filetag="driver")
+    gvt = type_leafview()
+    gv = name_leafview()
+    return ["Dune::PDELab::NonOverlappingEntitySet<{}> es({});".format(gvt, gv),
+            "{} << \"{} \" << {} << \" dofs dofs \" << {}.maxLocalSize() * es.size(0) << std::endl;".format(stream,
+                                                                                                            level,
+                                                                                                            ident,
+                                                                                                            name_trial_gfs())
+            ]
 
 
 @preamble
@@ -98,7 +107,7 @@ def evaluate_residual_timer():
         evaluation = ["HP_TIMER_START(residual_evaluation);",
                       "{}.residual({}, r);".format(n_go, v),
                       "HP_TIMER_STOP(residual_evaluation);",
-                      "DUMP_TIMER(residual_evaluation, {}, true);".format(timestream)]
+                      "DUMP_TIMER({}, residual_evaluation, {}, true);".format(get_option("instrumentation_level"), timestream)]
         evaluation.extend(print_times)
     else:
         evaluation = ["{}.residual({}, r);".format(n_go, v)]
@@ -137,7 +146,7 @@ def apply_jacobian_timer():
         evaluation = ["HP_TIMER_START(apply_jacobian);",
                       "{}.jacobian_apply({}, j);".format(n_go, v),
                       "HP_TIMER_STOP(apply_jacobian);",
-                      "DUMP_TIMER(apply_jacobian, {}, true);".format(timestream)]
+                      "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)]
         evaluation.extend(print_times)
     else:
         evaluation = ["{}.jacobian_apply({}, j);".format(n_go, v)]
@@ -174,7 +183,7 @@ def assemble_matrix_timer():
         assembly = ["HP_TIMER_START(matrix_assembly);",
                     "{}.jacobian({},m);".format(n_go, v),
                     "HP_TIMER_STOP(matrix_assembly);",
-                    "DUMP_TIMER(matrix_assembly, {}, true);".format(timestream)]
+                    "DUMP_TIMER({}, matrix_assembly, {}, true);".format(get_option("instrumentation_level"), timestream)]
         assembly.extend(print_times)
     else:
         assembly = ["{}.jacobian({},m);".format(n_go, v)]
diff --git a/python/dune/perftool/ufl/visitor.py b/python/dune/perftool/ufl/visitor.py
index 972a4bb7ac39c847c841993b12165b8264a36f73..7a3b15d201c5968b0eb4524964e4270523d9be1b 100644
--- a/python/dune/perftool/ufl/visitor.py
+++ b/python/dune/perftool/ufl/visitor.py
@@ -55,8 +55,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
             self.current_info = info
             expr = self._call(o, False)
             if expr != 0:
-                from dune.perftool.sympy import simplify_pymbolic_expression
-                expr = simplify_pymbolic_expression(expr)
+                # from dune.perftool.sympy import simplify_pymbolic_expression
+                # expr = simplify_pymbolic_expression(expr)
                 self.interface.generate_accumulation_instruction(expr, self)
 
     def _call(self, o, do_predicates):