diff --git a/bin/timings.sh b/bin/timings.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b85e6589dc9086b1488f788cccc86118e8a51e1e
--- /dev/null
+++ b/bin/timings.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# If an argument was given use it as the working directory
+if [ $# -eq 1 ]
+then
+  cd $1
+fi
+
+# Search for runnable executables
+FILES=$(ls *.ini | grep -v '^verify')
+for inifile in $FILES
+do
+  line=$(grep ^"opcounter = " $inifile)
+  extract=${line##opcounter = }
+  UPPER=10
+  if [ $extract -eq 1 ]
+  then
+    UPPER=1
+  fi
+  COUNT=0
+  while [ $COUNT -lt $UPPER ]; do
+    exec=${inifile%.ini}
+    MAXCORES=40
+    mpirun --bind-to core -np $MAXCORES ./$exec $inifile
+    COUNT=$((COUNT + 1))
+  done
+done
diff --git a/dune/perftool/common/opcounter.hh b/dune/perftool/common/opcounter.hh
index 5a0115e6b5649d45b86db1432d83a66330e00e0e..966f286e135409f79c4c3fd38d5c3d4117ef65a6 100644
--- a/dune/perftool/common/opcounter.hh
+++ b/dune/perftool/common/opcounter.hh
@@ -16,6 +16,16 @@ namespace oc {
   template<typename F>
   class OpCounter;
 
+  template<typename T>
+  struct isOpCounter : public std::false_type
+  {};
+
+  template<typename F>
+  struct isOpCounter<OpCounter<F>> : public std::true_type
+  {};
+
+  template<typename T>
+  constexpr bool isOpCounterV = isOpCounter<T>::value;
 }
 
 namespace Dune {
diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh
index 0f5b9b88afb7869a32e321f5276d2b245c28439b..9a87d0586a2bf4309826d48f4f850e3d644b2d8f 100644
--- a/dune/perftool/common/vectorclass.hh
+++ b/dune/perftool/common/vectorclass.hh
@@ -244,6 +244,30 @@ static inline _vcl::Vec4db operator == (Vec4d const & a, Vec4d const & b) {
   return a_ == b_;
 }
 
+// vector operator == : returns true for elements for which a == b
+static inline _vcl::Vec4db operator == (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ == b_;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline _vcl::Vec4db operator == (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ == b_;
+}
+
 // vector operator != : returns true for elements for which a != b
 static inline _vcl::Vec4db operator != (Vec4d const & a, Vec4d const & b) {
   BARRIER;
@@ -258,6 +282,30 @@ static inline _vcl::Vec4db operator != (Vec4d const & a, Vec4d const & b) {
   return a_ != b_;
 }
 
+// vector operator != : returns true for elements for which a != b
+static inline _vcl::Vec4db operator != (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ != b_;
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline _vcl::Vec4db operator != (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ != b_;
+}
+
 // vector operator < : returns true for elements for which a < b
 static inline _vcl::Vec4db operator < (Vec4d const & a, Vec4d const & b) {
   BARRIER;
@@ -272,6 +320,30 @@ static inline _vcl::Vec4db operator < (Vec4d const & a, Vec4d const & b) {
   return a_ < b_;
 }
 
+// vector operator < : returns true for elements for which a < b
+static inline _vcl::Vec4db operator < (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ < b_;
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline _vcl::Vec4db operator < (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return b_ < a_;
+}
+
 // vector operator <= : returns true for elements for which a <= b
 static inline _vcl::Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
   BARRIER;
@@ -286,16 +358,61 @@ static inline _vcl::Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
   return a_ <= b_;
 }
 
+// vector operator <= : returns true for elements for which a <= b
+static inline _vcl::Vec4db operator <= (oc::OpCounter<double> a, Vec4d const & b) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return a_ <= b_;
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline _vcl::Vec4db operator <= (Vec4d const & b, oc::OpCounter<double> a) {
+  BARRIER;
+  _vcl::Vec4d a_(a._v), b_;
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec4d::F::comparisons(4);
+  BARRIER;
+  return b_ <= a_;
+}
+
 // vector operator > : returns true for elements for which a > b
 static inline _vcl::Vec4db operator > (Vec4d const & a, Vec4d const & b) {
     return b < a;
 }
 
+// vector operator > : returns true for elements for which a > b
+static inline _vcl::Vec4db operator > (oc::OpCounter<double> a, Vec4d const & b) {
+    return a < b;
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline _vcl::Vec4db operator > (Vec4d const & b, oc::OpCounter<double> a) {
+    return a < b;
+}
+
 // vector operator >= : returns true for elements for which a >= b
 static inline _vcl::Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
     return b <= a;
 }
 
+// vector operator >= : returns true for elements for which a >= b
+static inline _vcl::Vec4db operator >= (oc::OpCounter<double> a, Vec4d const & b) {
+    return b <= a;
+}
+
+// vector operator >= : returns true for elements for which a >= b
+static inline _vcl::Vec4db operator >= (Vec4d const & b, oc::OpCounter<double> a) {
+    return a <= b;
+}
+
+
 // avoid logical operators for now, I don't think we need them
 #if 0
 
@@ -411,81 +528,40 @@ static inline Vec4d exp(Vec4d const & a){
   return r;
 }
 
-
-// ignore pow() for now
-#if 0
-
-// pow(Vec4d, int):
-template <typename TT> static Vec4d pow(Vec4d const & a, TT n);
-
-// Raise floating point numbers to integer power n
-template <>
-inline Vec4d pow<int>(Vec4d const & x0, int n) {
-    return pow_template_i<Vec4d>(x0, n);
+// pow
+template <typename TT>
+static inline Vec4d pow(Vec4d const & a, oc::OpCounter<TT> n)
+{
+  BARRIER;
+  Vec4d r;
+  std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); });
+  BARRIER;
+  return r;
 }
 
-// allow conversion from unsigned int
-template <>
-inline Vec4d pow<uint32_t>(Vec4d const & x0, uint32_t n) {
-    return pow_template_i<Vec4d>(x0, (int)n);
+// pow
+template <typename TT>
+static inline
+std::enable_if_t<not oc::isOpCounterV<TT>, Vec4d> pow(Vec4d const & a, TT n)
+{
+  BARRIER;
+  Vec4d r;
+  std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); });
+  BARRIER;
+  return r;
 }
 
 
-// Raise floating point numbers to integer power n, where n is a compile-time constant
-template <int n>
-static inline Vec4d pow_n(Vec4d const & a) {
-    if (n < 0)    return Vec4d(1.0) / pow_n<-n>(a);
-    if (n == 0)   return Vec4d(1.0);
-    if (n >= 256) return pow(a, n);
-    Vec4d x = a;                       // a^(2^i)
-    Vec4d y;                           // accumulator
-    const int lowest = n - (n & (n-1));// lowest set bit in n
-    if (n & 1) y = x;
-    if (n < 2) return y;
-    x = x*x;                           // x^2
-    if (n & 2) {
-        if (lowest == 2) y = x; else y *= x;
-    }
-    if (n < 4) return y;
-    x = x*x;                           // x^4
-    if (n & 4) {
-        if (lowest == 4) y = x; else y *= x;
-    }
-    if (n < 8) return y;
-    x = x*x;                           // x^8
-    if (n & 8) {
-        if (lowest == 8) y = x; else y *= x;
-    }
-    if (n < 16) return y;
-    x = x*x;                           // x^16
-    if (n & 16) {
-        if (lowest == 16) y = x; else y *= x;
-    }
-    if (n < 32) return y;
-    x = x*x;                           // x^32
-    if (n & 32) {
-        if (lowest == 32) y = x; else y *= x;
-    }
-    if (n < 64) return y;
-    x = x*x;                           // x^64
-    if (n & 64) {
-        if (lowest == 64) y = x; else y *= x;
-    }
-    if (n < 128) return y;
-    x = x*x;                           // x^128
-    if (n & 128) {
-        if (lowest == 128) y = x; else y *= x;
-    }
-    return y;
-}
-
-template <int n>
-static inline Vec4d pow(Vec4d const & a, Const_int_t<n>) {
-    return pow_n<n>(a);
+static inline Vec4d select(const _vcl::Vec4db& s, const Vec4d& a, const Vec4d& b)
+{
+  BARRIER;
+  Vec4d r;
+  for(int i=0; i<4; ++i)
+    r._d[i] = s.extract(i) ? a._d[i] : b._d[i];
+  BARRIER;
+  return r;
 }
 
-#endif
-
 // function round: round to nearest integer (even). (result as double vector)
 static inline Vec4d round(Vec4d const & a) {
   BARRIER;
@@ -1095,7 +1171,7 @@ struct Vec8f
 
 /*****************************************************************************
 *
-*          Operators for Vec4d
+*          Operators for Vec8f
 *
 *****************************************************************************/
 
diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py
index 91666020ee0487fdb6a8733f5a1893ffa014587d..4a832ba7709c9f8ca7f7c6f6380230f43fa55579 100644
--- a/python/dune/perftool/pdelab/driver/__init__.py
+++ b/python/dune/perftool/pdelab/driver/__init__.py
@@ -296,7 +296,7 @@ def generate_driver():
     add_section("vtk", "Do visualization...")
     add_section("instat", "Set up instationary stuff...")
     add_section("timings", "Maybe take performance measurements...")
-    add_section("print", "Maybe print residuals and matrices to stdout...")
+    add_section("printing", "Maybe print residuals and matrices to stdout...")
     add_section("error", "Maybe calculate errors for test results...")
 
     if get_option("instrumentation_level") >= 1:
diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py
index 4ecce62ff12bf1afd4f36b4d16ca1216c0ba3910..8f3ef96ff9fd2babe838a2b3ceb7baeeb63d67f7 100644
--- a/python/dune/perftool/pdelab/driver/solve.py
+++ b/python/dune/perftool/pdelab/driver/solve.py
@@ -204,7 +204,7 @@ def name_stationarynonlinearproblemsolver(go_type, go):
     return name
 
 
-@preamble(section="print")
+@preamble(section="printing")
 def print_residual():
     ini = name_initree()
     n_go = name_gridoperator(get_form_ident())
@@ -227,7 +227,7 @@ def print_residual():
             "}"]
 
 
-@preamble(section="print")
+@preamble(section="printing")
 def print_matrix():
     ini = name_initree()
     t_go = type_gridoperator(get_form_ident())
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index c56f24e25f1e62fa116b8f753c0eb7de04a2d8db..b205b34c4f48fcd0df5bdfdae3213d8c2c929969 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -613,6 +613,8 @@ class TimerMethod(ClassMember):
 
 class LoopyKernelMethod(ClassMember):
     def __init__(self, signature, kernel, add_timings=True, initializer_list=[]):
+        self.name = kernel.name
+
         from loopy import generate_body
         from cgen import LiteralLines, Block
         content = signature
@@ -659,6 +661,10 @@ class LoopyKernelMethod(ClassMember):
 def cgen_class_from_cache(tag, members=[]):
     from dune.perftool.generation import retrieve_cache_items
 
+    # Sort the given member functions by their name to help debugging by fixing
+    # the order
+    members = sorted(members, key=lambda m: m.name)
+
     # Generate the name by concatenating basename and template parameters
     basename, fullname = class_type_from_cache(tag)