diff --git a/applications/knl/poisson_dg/knl_poisson_dg.mini b/applications/knl/poisson_dg/knl_poisson_dg.mini
index ba2f345fd39edcca22fbbb9aded8e4bd3193157c..41755b7e08a746af270a3d6572dea6dc6273404a 100644
--- a/applications/knl/poisson_dg/knl_poisson_dg.mini
+++ b/applications/knl/poisson_dg/knl_poisson_dg.mini
@@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for
 
 opcount_suffix = opcount, nonopcount | expand opcount
 {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude
+dune-opcounter_FOUND, 1 | expand opcount | cmake_guard
 
 # Calculate the size of the grid to equlibritate it to 100 MB/rank
 # Input parameters
diff --git a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
index 4f3f5826f71fd9fc742efa4d88ae222e45759529..9d87f7eea1d29b61fc50ee460b8a1098b42cfc5a 100644
--- a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
+++ b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
@@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for
 
 opcount_suffix = opcount, nonopcount | expand opcount
 {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude
+dune-opcounter_FOUND, 1 | expand opcount | cmake_guard
 
 # Calculate the size of the grid to equlibritate it to 100 MB/rank
 # Input parameters
diff --git a/applications/poisson_dg/poisson_dg.mini b/applications/poisson_dg/poisson_dg.mini
index 7e65a4815f5fccc2dc6aae955bbdef559e9ebe92..d031d8233611e2492ef17eb80c3ae5d24518183f 100644
--- a/applications/poisson_dg/poisson_dg.mini
+++ b/applications/poisson_dg/poisson_dg.mini
@@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for
 
 opcount_suffix = opcount, nonopcount | expand opcount
 {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude
+dune-opcounter_FOUND, 1 | expand opcount | cmake_guard
 
 # Calculate the size of the grid to equlibritate it to 100 MB/rank
 # Input parameters
diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
index 4e7ac555b33fa5fcef568cca5b6dddf6dd3552e3..a5738a46a076588437d0cf2a509fa4623290af2f 100644
--- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini
+++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
@@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for
 
 opcount_suffix = opcount, nonopcount | expand opcount
 {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude
+dune-opcounter_FOUND, 1 | expand opcount | cmake_guard
 
 # Calculate the size of the grid to equlibritate it to 100 MB/rank
 # Input parameters
diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini
index a80567f9b73473ae09e52c3a768177431435e849..6160ced2c164d0fb777f4777234bb30dd0d11768 100644
--- a/applications/stokes_dg/stokes_dg.mini
+++ b/applications/stokes_dg/stokes_dg.mini
@@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.v_degree}_{opcount_suffix}_level{f
 
 opcount_suffix = opcount, nonopcount | expand opcount
 {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude
+dune-opcounter_FOUND, 1 | expand opcount | cmake_guard
 
 # Calculate the size of the grid to equlibritate it to 100 MB/rank
 # Input parameters
diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake
index 634d7ad35605ef6707436eaad98f882c40bb4c1d..2b1d87728c04a9359df79f3711c0df9c4f3f621e 100644
--- a/cmake/modules/GeneratedSystemtests.cmake
+++ b/cmake/modules/GeneratedSystemtests.cmake
@@ -96,7 +96,7 @@ function(dune_add_formcompiler_system_test)
                )
 
       set_tests_properties(${tname} PROPERTIES SKIP_RETURN_CODE 77)
-      set_tests_properties(${tname} PROPERTIES TIMEOUT 60)
+      set_tests_properties(${tname} PROPERTIES TIMEOUT 120)
     endif()
   endforeach()
 endfunction()
diff --git a/dune.module b/dune.module
index 264e9fb20580d83c62defce6410f9f3326728cc5..c6a0e7bb6e053b8d274511f4dbb54d9d2eab84c4 100644
--- a/dune.module
+++ b/dune.module
@@ -8,3 +8,4 @@ Version: 0.0
 Maintainer: dominic.kempf@iwr.uni-heidelberg.de
 #depending on
 Depends: dune-testtools dune-pdelab dune-alugrid
+Suggests: dune-opcounter
diff --git a/dune/perftool/common/CMakeLists.txt b/dune/perftool/common/CMakeLists.txt
index 31520e34b6dde5cc0de439d9885128f44e5b1a1c..6cd100631e8241eb91cbd4a3f11b27197941b8c9 100644
--- a/dune/perftool/common/CMakeLists.txt
+++ b/dune/perftool/common/CMakeLists.txt
@@ -1,6 +1,7 @@
 install(FILES muladd_workarounds.hh
-              opcounter.hh
               timer.hh
+              timer_tsc.hh
+              timer_chrono.hh
               tsc.hh
               vectorclass.hh
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/dune/perftool/common
diff --git a/dune/perftool/common/muladd_workarounds.hh b/dune/perftool/common/muladd_workarounds.hh
index f05dab7c8695d5f6d587a86b473a2fc86a9f372c..ba728fad57bbbe78ed997c6b8f42f5cbf265988b 100644
--- a/dune/perftool/common/muladd_workarounds.hh
+++ b/dune/perftool/common/muladd_workarounds.hh
@@ -3,36 +3,14 @@
 
 /* We are currently having some issues with FMA nodes not being
  * eliminated correctly upon code generation. We "solve" the problem
- * for now with overloads of the mul_add function for scalars.
+ * for now with a generic implementation of  the mul_add function.
  */
 
-#include<dune/perftool/common/opcounter.hh>
 
-
-inline double mul_add(double op1, double& op2, double op3)
-{
-	return op1 * op2 + op3;
-}
-
-
-inline float mul_add(float op1, float& op2, float op3)
+template<typename T>
+inline T mul_add(T op1, T op2, T op3)
 {
-	return op1 * op2 + op3;
+  return op1 * op2 + op3;
 }
 
-
-#ifdef ENABLE_COUNTER
-
-oc::OpCounter<double> mul_add(oc::OpCounter<double> op1, oc::OpCounter<double>& op2, oc::OpCounter<double> op3)
-{
-	return op1 * op2 + op3;
-}
-
-oc::OpCounter<float> mul_add(oc::OpCounter<float> op1, oc::OpCounter<float>& op2, oc::OpCounter<float> op3)
-{
-	return op1 * op2 + op3;
-}
-
-#endif
-
 #endif
diff --git a/dune/perftool/common/opcounter.hh b/dune/perftool/common/opcounter.hh
deleted file mode 100644
index edb16eaacc39f3c383fc7ace2fc7b733354116e7..0000000000000000000000000000000000000000
--- a/dune/perftool/common/opcounter.hh
+++ /dev/null
@@ -1,974 +0,0 @@
-#ifndef __OPCOUNTER__
-#define __OPCOUNTER__
-
-#include <type_traits>
-#include <iostream>
-#include <cmath>
-#include <cstdlib>
-#include <ostream>
-#include <sstream>
-#include <istream>
-
-#include <dune/common/typetraits.hh>
-
-namespace oc {
-
-  template<typename F>
-  class OpCounter;
-
-  template<typename T>
-  struct isOpCounter : public std::false_type
-  {};
-
-  template<typename F>
-  struct isOpCounter<OpCounter<F>> : public std::true_type
-  {};
-
-  template<typename T>
-  constexpr bool isOpCounterV = isOpCounter<T>::value;
-}
-
-namespace Dune {
-
-  template <typename T>
-  struct IsNumber<oc::OpCounter<T>>
-    : public std::integral_constant<bool, IsNumber<T>::value>{
-  };
-
-  template<typename T, int n>
-  class FieldVector;
-
-}
-
-namespace oc {
-
-  template<typename F>
-  class OpCounter
-  {
-
-  public:
-
-    using size_type = std::size_t;
-
-    using value_type = F;
-
-    OpCounter()
-      : _v()
-    {}
-
-    template<typename T>
-    OpCounter(const T& t, typename std::enable_if<std::is_same<T,int>::value and !std::is_same<F,int>::value>::type* = nullptr)
-      : _v(t)
-    {}
-
-    OpCounter(const F& f)
-      : _v(f)
-    {}
-
-    OpCounter(F&& f)
-      : _v(f)
-    {}
-
-    explicit OpCounter(const char* s)
-      : _v(strtod(s,nullptr))
-    {}
-
-    OpCounter& operator=(const char* s)
-    {
-      _v = strtod(s,nullptr);
-      return *this;
-    }
-
-    explicit operator F() const
-    {
-      return _v;
-    }
-
-    OpCounter& operator=(const F& f)
-    {
-      _v = f;
-      return *this;
-    }
-
-    OpCounter& operator=(F&& f)
-    {
-      _v = f;
-      return *this;
-    }
-
-    friend std::ostream& operator<<(std::ostream& os, const OpCounter& f)
-    {
-      os << "OC(" << f._v << ")";
-      return os;
-    }
-
-    friend std::istringstream& operator>>(std::istringstream& iss, OpCounter& f)
-    {
-      iss >> f._v;
-      return iss;
-    }
-
-    friend std::istream& operator>>(std::istream& is, OpCounter& f)
-    {
-      is >> f._v;
-      return is;
-    }
-
-    F* data()
-    {
-      return &_v;
-    }
-
-    const F* data() const
-    {
-      return &_v;
-    }
-
-    F _v;
-
-    // The alignment is necessary to work around
-    // https://bugs.llvm.org/show_bug.cgi?id=34219:
-    // "SLP vectorizer: aligned store to unaligned address"
-    // The bug is present in 3.8.0 (tags/RELEASE_380/final 263969) and in
-    // clang version 6.0.0 (trunk 310993)
-    struct alignas(16) Counters {
-
-      size_type addition_count;
-      size_type multiplication_count;
-      size_type division_count;
-      size_type exp_count;
-      size_type pow_count;
-      size_type sin_count;
-      size_type sqrt_count;
-      size_type comparison_count;
-      size_type blend_count;
-      size_type permute_count;
-
-      Counters()
-        : addition_count(0)
-        , multiplication_count(0)
-        , division_count(0)
-        , exp_count(0)
-        , pow_count(0)
-        , sin_count(0)
-        , sqrt_count(0)
-        , comparison_count(0)
-        , blend_count(0)
-        , permute_count(0)
-      {}
-
-      void reset()
-      {
-        addition_count = 0;
-        multiplication_count = 0;
-        division_count = 0;
-        exp_count = 0;
-        pow_count = 0;
-        sin_count = 0;
-        sqrt_count = 0;
-        comparison_count = 0;
-        blend_count = 0;
-        permute_count = 0;
-      }
-
-      template<typename Stream, typename T>
-      void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false)
-      {
-        auto total = addition_count + multiplication_count + division_count + exp_count + pow_count + sin_count + sqrt_count + comparison_count;
-        if (total == 0)
-          return;
-        os << level << " " << exec << " " << kernel << " additions " << addition_count << std::endl
-           << level << " " << exec << " " << kernel << " multiplications " << multiplication_count << std::endl
-           << level << " " << exec << " " << kernel << " divisions " << division_count << std::endl
-           << level << " " << exec << " " << kernel << " exp " << exp_count << std::endl
-           << level << " " << exec << " " << kernel << " pow " << pow_count << std::endl
-           << level << " " << exec << " " << kernel << " sin " << sin_count << std::endl
-           << level << " " << exec << " " << kernel << " sqrt " << sqrt_count << std::endl
-           << level << " " << exec << " " << kernel << " comparisons " << comparison_count << std::endl
-           << level << " " << exec << " " << kernel << " blends " << blend_count << std::endl
-           << level << " " << exec << " " << kernel << " permutes " << permute_count << std::endl
-           << level << " " << exec << " " << kernel << " totalops " << total << std::endl;
-
-        if (doReset)
-          reset();
-      }
-
-      Counters& operator+=(const Counters& rhs)
-      {
-        addition_count += rhs.addition_count;
-        multiplication_count += rhs.multiplication_count;
-        division_count += rhs.division_count;
-        exp_count += rhs.exp_count;
-        pow_count += rhs.pow_count;
-        sin_count += rhs.sin_count;
-        sqrt_count += rhs.sqrt_count;
-        comparison_count += rhs.comparison_count;
-        blend_count += rhs.blend_count;
-        permute_count += rhs.permute_count;
-        return *this;
-      }
-
-      Counters operator-(const Counters& rhs)
-      {
-        Counters r;
-        r.addition_count = addition_count - rhs.addition_count;
-        r.multiplication_count = multiplication_count - rhs.multiplication_count;
-        r.division_count = division_count - rhs.division_count;
-        r.exp_count = exp_count - rhs.exp_count;
-        r.pow_count = pow_count - rhs.pow_count;
-        r.sin_count = sin_count - rhs.sin_count;
-        r.sqrt_count = sqrt_count - rhs.sqrt_count;
-        r.comparison_count = comparison_count - rhs.comparison_count;
-        r.blend_count = blend_count - rhs.blend_count;
-        r.permute_count = permute_count - rhs.permute_count;
-        return r;
-      }
-
-    };
-
-    static void additions(std::size_t n)
-    {
-      counters.addition_count += n;
-    }
-
-    static void multiplications(std::size_t n)
-    {
-      counters.multiplication_count += n;
-    }
-
-    static void divisions(std::size_t n)
-    {
-      counters.division_count += n;
-    }
-
-    static void comparisons(std::size_t n)
-    {
-      counters.comparison_count += n;
-    }
-
-    static void blends(std::size_t n)
-    {
-      counters.blend_count += n;
-    }
-
-    static void permutes(std::size_t n)
-    {
-      counters.permute_count += n;
-    }
-
-    static void reset()
-    {
-      counters.reset();
-    }
-
-    template<typename Stream, typename T>
-    static void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false)
-    {
-      counters.reportOperations(os, level, exec, kernel, doReset);
-    }
-
-    static Counters counters;
-
-  };
-
-  template<typename F>
-  typename OpCounter<F>::Counters OpCounter<F>::counters;
-
-  // ********************************************************************************
-  // negation
-  // ********************************************************************************
-
-  template<typename F>
-  OpCounter<F> operator-(const OpCounter<F>& a)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {-a._v};
-  }
-
-
-  // ********************************************************************************
-  // addition
-  // ********************************************************************************
-
-  template<typename F>
-  OpCounter<F> operator+(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a._v + b._v};
-  }
-
-  template<typename F>
-  OpCounter<F> operator+(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a._v + b};
-  }
-
-  template<typename F>
-  OpCounter<F> operator+(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a + b._v};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator+(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a._v + b};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator+(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a + b._v};
-  }
-
-  template<typename F>
-  OpCounter<F>& operator+=(OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    a._v += b._v;
-    return a;
-  }
-
-  template<typename F>
-  OpCounter<F>& operator+=(OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    a._v += b;
-    return a;
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>&
-    >::type
-  operator+=(OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    a._v += b;
-    return a;
-  }
-
-  template<typename F>
-  OpCounter<F>& operator+=(OpCounter<F>& a, const Dune::FieldVector<OpCounter<F>,1>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    a._v += b[0]._v;
-    return a;
-  }
-
-  // ********************************************************************************
-  // subtraction
-  // ********************************************************************************
-
-  template<typename F>
-  OpCounter<F> operator-(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a._v - b._v};
-  }
-
-  template<typename F>
-  OpCounter<F> operator-(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a._v - b};
-  }
-
-  template<typename F>
-  OpCounter<F> operator-(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a - b._v};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator-(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a._v - b};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator-(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    return {a - b._v};
-  }
-
-  template<typename F>
-  OpCounter<F>& operator-=(OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    a._v -= b._v;
-    return a;
-  }
-
-  template<typename F>
-  OpCounter<F>& operator-=(OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    a._v -= b;
-    return a;
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>&
-    >::type
-  operator-=(OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.addition_count;
-    a._v -= b;
-    return a;
-  }
-
-
-  // ********************************************************************************
-  // multiplication
-  // ********************************************************************************
-
-  template<typename F>
-  OpCounter<F> operator*(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    return {a._v * b._v};
-  }
-
-  template<typename F>
-  OpCounter<F> operator*(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    return {a._v * b};
-  }
-
-  template<typename F>
-  OpCounter<F> operator*(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    return {a * b._v};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator*(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    return {a._v * b};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator*(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    return {a * b._v};
-  }
-
-  template<typename F>
-  OpCounter<F>& operator*=(OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    a._v *= b._v;
-    return a;
-  }
-
-  template<typename F>
-  OpCounter<F>& operator*=(OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    a._v *= b;
-    return a;
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>&
-    >::type
-  operator*=(OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.multiplication_count;
-    a._v *= b;
-    return a;
-  }
-
-
-  // ********************************************************************************
-  // division
-  // ********************************************************************************
-
-  template<typename F>
-  OpCounter<F> operator/(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    return {a._v / b._v};
-  }
-
-  template<typename F>
-  OpCounter<F> operator/(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    return {a._v / b};
-  }
-
-  template<typename F>
-  OpCounter<F> operator/(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    return {a / b._v};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator/(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    return {a._v / b};
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>
-    >::type
-  operator/(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    return {a / b._v};
-  }
-
-  template<typename F>
-  OpCounter<F>& operator/=(OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    a._v /= b._v;
-    return a;
-  }
-
-  template<typename F>
-  OpCounter<F>& operator/=(OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    a._v /= b;
-    return a;
-  }
-
-  template<typename F, typename T>
-  typename std::enable_if<
-    std::is_arithmetic<T>::value,
-    OpCounter<F>&
-    >::type
-  operator/=(OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.division_count;
-    a._v /= b;
-    return a;
-  }
-
-
-
-  // ********************************************************************************
-  // comparisons
-  // ********************************************************************************
-
-
-  // ********************************************************************************
-  // less
-  // ********************************************************************************
-
-  template<typename F>
-  bool operator<(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v < b._v};
-  }
-
-  template<typename F>
-  bool operator<(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v < b};
-  }
-
-  template<typename F>
-  bool operator<(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a < b._v};
-  }
-
-  template<typename F, typename T>
-  bool operator<(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v < b};
-  }
-
-  template<typename F, typename T>
-  bool operator<(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a < b._v};
-  }
-
-
-  // ********************************************************************************
-  // less_or_equals
-  // ********************************************************************************
-
-  template<typename F>
-  bool operator<=(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v <= b._v};
-  }
-
-  template<typename F>
-  bool operator<=(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v <= b};
-  }
-
-  template<typename F>
-  bool operator<=(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a <= b._v};
-  }
-
-  template<typename F, typename T>
-  bool operator<=(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v <= b};
-  }
-
-  template<typename F, typename T>
-  bool operator<=(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a <= b._v};
-  }
-
-
-  // ********************************************************************************
-  // greater
-  // ********************************************************************************
-
-  template<typename F>
-  bool operator>(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v > b._v};
-  }
-
-  template<typename F>
-  bool operator>(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v > b};
-  }
-
-  template<typename F>
-  bool operator>(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a > b._v};
-  }
-
-  template<typename F, typename T>
-  bool operator>(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v > b};
-  }
-
-  template<typename F, typename T>
-  bool operator>(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a > b._v};
-  }
-
-
-  // ********************************************************************************
-  // greater_or_equals
-  // ********************************************************************************
-
-  template<typename F>
-  bool operator>=(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v >= b._v};
-  }
-
-  template<typename F>
-  bool operator>=(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v >= b};
-  }
-
-  template<typename F>
-  bool operator>=(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a >= b._v};
-  }
-
-  template<typename F, typename T>
-  bool operator>=(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v >= b};
-  }
-
-  template<typename F, typename T>
-  bool operator>=(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a >= b._v};
-  }
-
-
-  // ********************************************************************************
-  // inequals
-  // ********************************************************************************
-
-  template<typename F>
-  bool operator!=(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v != b._v};
-  }
-
-  template<typename F>
-  bool operator!=(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v != b};
-  }
-
-  template<typename F>
-  bool operator!=(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a != b._v};
-  }
-
-  template<typename F, typename T>
-  bool operator!=(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v != b};
-  }
-
-  template<typename F, typename T>
-  bool operator!=(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a != b._v};
-  }
-
-
-  // ********************************************************************************
-  // equals
-  // ********************************************************************************
-
-  template<typename F>
-  bool operator==(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v == b._v};
-  }
-
-  template<typename F>
-  bool operator==(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v == b};
-  }
-
-  template<typename F>
-  bool operator==(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a == b._v};
-  }
-
-  template<typename F, typename T>
-  bool operator==(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a._v == b};
-  }
-
-  template<typename F, typename T>
-  bool operator==(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {a == b._v};
-  }
-
-
-
-  // ********************************************************************************
-  // functions
-  // ********************************************************************************
-
-  template<typename F>
-  OpCounter<F> exp(const OpCounter<F>& a)
-  {
-    ++OpCounter<F>::counters.exp_count;
-    return {std::exp(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> pow(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.pow_count;
-    return {std::pow(a._v,b._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> pow(const OpCounter<F>& a, const F& b)
-  {
-    ++OpCounter<F>::counters.pow_count;
-    return {std::pow(a._v,b)};
-  }
-
-  template<typename F, typename T>
-  OpCounter<F> pow(const OpCounter<F>& a, const T& b)
-  {
-    ++OpCounter<F>::counters.pow_count;
-    return {std::pow(a._v,b)};
-  }
-
-  template<typename F>
-  OpCounter<F> pow(const F& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.pow_count;
-    return {std::pow(a,b._v)};
-  }
-
-  template<typename F, typename T>
-  OpCounter<F> pow(const T& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.pow_count;
-    return {std::pow(a,b._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> sin(const OpCounter<F>& a)
-  {
-    ++OpCounter<F>::counters.sin_count;
-    return {std::sin(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> cos(const OpCounter<F>& a)
-  {
-    ++OpCounter<F>::counters.sin_count;
-    return {std::cos(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> sqrt(const OpCounter<F>& a)
-  {
-    ++OpCounter<F>::counters.sqrt_count;
-    return {std::sqrt(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> abs(const OpCounter<F>& a)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    return {std::abs(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> max(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    using std::max;
-    return {max(a._v,b._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> min(const OpCounter<F>& a, const OpCounter<F>& b)
-  {
-    ++OpCounter<F>::counters.comparison_count;
-    using std::min;
-    return {min(a._v,b._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> round(const OpCounter<F>& a)
-  {
-    using std::round;
-    return {round(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> trunc(const OpCounter<F>& a)
-  {
-    using std::trunc;
-    return {trunc(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> ceil(const OpCounter<F>& a)
-  {
-    using std::ceil;
-    return {ceil(a._v)};
-  }
-
-  template<typename F>
-  OpCounter<F> floor(const OpCounter<F>& a)
-  {
-    using std::floor;
-    return {floor(a._v)};
-  }
-
-  template<typename F>
-  bool isnan(const OpCounter<F>& a)
-  {
-	using std::isnan;
-	return {isnan(a._v)};
-  }
-
-}
-
-#endif // __OPCOUNTER__
diff --git a/dune/perftool/common/timer_chrono.hh b/dune/perftool/common/timer_chrono.hh
index d1b0bd9b5d50e43a32a07623ef417c76337d8af6..8e96a70a2e4126471f2636300bfb9a46841d5f9f 100644
--- a/dune/perftool/common/timer_chrono.hh
+++ b/dune/perftool/common/timer_chrono.hh
@@ -7,7 +7,7 @@
 
 #include <chrono>
 
-#include <dune/perftool/common/opcounter.hh>
+#include <dune/opcounter/opcounter.hh>
 
 #define HP_TIMER_OPCOUNTER oc::OpCounter<double>
 
diff --git a/dune/perftool/common/timer_tsc.hh b/dune/perftool/common/timer_tsc.hh
index 0f9e1e040f7d30ebe8d54c3d793e959699250e96..10ec75d98ffbb5a68425dbab21367fb20bdbf395 100644
--- a/dune/perftool/common/timer_tsc.hh
+++ b/dune/perftool/common/timer_tsc.hh
@@ -6,7 +6,7 @@
 #endif
 
 #include <dune/perftool/common/tsc.hh>
-#include <dune/perftool/common/opcounter.hh>
+#include <dune/opcounter/opcounter.hh>
 
 #define HP_TIMER_DURATION(name) __hp_timer_##name##_duration
 #define HP_TIMER_STARTTIME(name) __hp_timer_##name##_start
@@ -76,16 +76,22 @@
 #ifdef ENABLE_COUNTER
 
 #define DUMP_TIMER(level,name,os,reset)\
-  if (HP_TIMER_DURATION(name) > 1e-12) \
-    os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
-  HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset);
+  { \
+    std::string prefix = std::string(#level) + " " + ident + " " + std::string(#name); \
+    if (HP_TIMER_DURATION(name) > 1e-12) \
+      os << prefix << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
+    HP_TIMER_OPCOUNTERS(name).reportOperations(os,prefix,reset); \
+  }
 
 #define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops)  \
-  if (HP_TIMER_DURATION(name) > 1e-12) \
-    os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
-  time += HP_TIMER_DURATION(name); \
-  ops += HP_TIMER_OPCOUNTERS(name); \
-  HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset);
+  { \
+    std::string prefix = std::string(#level) + " " + ident + " " + std::string(#name); \
+    if (HP_TIMER_DURATION(name) > 1e-12) \
+      os << prefix << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \
+    time += HP_TIMER_DURATION(name); \
+    ops += HP_TIMER_OPCOUNTERS(name); \
+    HP_TIMER_OPCOUNTERS(name).reportOperations(os,prefix,reset); \
+  }
 
 #elif defined ENABLE_HP_TIMERS
 
diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh
index 6204b0a213796861533b9668f4c9a869198eb7cc..9fe0cf17b47abda0691c14591be3fcc15ee6e4d7 100644
--- a/dune/perftool/common/vectorclass.hh
+++ b/dune/perftool/common/vectorclass.hh
@@ -1,1770 +1,72 @@
-#ifndef DUNE_PDELAB_COMMON_VECTORCLASS_HH
-#define DUNE_PDELAB_COMMON_VECTORCLASS_HH
+#ifndef DUNE_PERFTOOL_COMMON_VECTORCLASS_HH
+#define DUNE_PERFTOOL_COMMON_VECTORCLASS_HH
 
-#ifdef VECTORCLASS_H
-#error Do not include the vectorclass.h header directly, always use this wrapper!
-#endif
-
-#ifdef VCL_NAMESPACE
-#error Do not manually set VCL_NAMESPACE, it is used internally by PDELab
-#endif
-
-#define BARRIER asm volatile("": : :"memory")
 
 template<typename T>
 struct base_floatingpoint
 {};
 
-#ifndef ENABLE_COUNTER
+#ifdef ENABLE_COUNTER
+#if HAVE_DUNE_OPCOUNTER
+#include<dune/opcounter/vectorclass.hh>
+
+template<typename F, int size>
+struct base_floatingpoint<OpCounter::impl::OpCounterVector<F, size>>
+{
+  using value = OpCounter::OpCounter<F>;
+};
 
-#include <dune/perftool/vectorclass/vectorclass.h>
-#include <dune/perftool/vectorclass/vectormath_exp.h>
-#include <dune/perftool/vectorclass/vectormath_trig.h>
+
+#else
+#error "dune-opcounter is needed for opcounted vector types"
+#endif
+#else
+#include<dune/perftool/vectorclass/vectorclass.h>
+#include<dune/perftool/vectorclass/vectormath_exp.h>
+#include<dune/perftool/vectorclass/vectormath_hyp.h>
+#include<dune/perftool/vectorclass/vectormath_trig.h>
 
 template<>
-struct base_floatingpoint<Vec4d>
+struct base_floatingpoint<Vec2d>
 {
   using value = double;
 };
 
 template<>
-struct base_floatingpoint<Vec8f>
+struct base_floatingpoint<Vec4f>
 {
   using value = float;
 };
 
-#if MAX_VECTOR_SIZE >= 512
 
+#if MAX_VECTOR_SIZE >= 256
 template<>
-struct base_floatingpoint<Vec8d>
+struct base_floatingpoint<Vec4d>
 {
   using value = double;
 };
 
-#endif
-
-#else
-
-#include <algorithm>
-
-#define VCL_NAMESPACE _vcl
-#include <dune/perftool/vectorclass/vectorclass.h>
-#include <dune/perftool/vectorclass/vectormath_exp.h>
-#include <dune/perftool/vectorclass/vectormath_trig.h>
-
-#include <dune/perftool/common/opcounter.hh>
-
-#include <numeric>
-
-struct Vec4d
-{
-  oc::OpCounter<double> _d[4];
-
-  using F = oc::OpCounter<double>;
-
-  Vec4d()
-  {}
-
-  Vec4d(F d)
-  {
-    BARRIER;
-    std::fill(_d,_d+4,d);
-    BARRIER;
-  }
-
-  Vec4d(F dl, F du)
-  {
-    BARRIER;
-    std::fill(_d,_d+2,dl);
-    std::fill(_d+2,_d+4,du);
-    BARRIER;
-  }
-
-  Vec4d(F d0, F d1, F d2, F d3)
-    : _d{d0,d1,d2,d3}
-  {
-    BARRIER;
-  }
-
-  Vec4d& load(const F* p)
-  {
-    BARRIER;
-    std::copy(p,p+4,_d);
-    BARRIER;
-    return *this;
-  }
-
-  Vec4d& load_a(const F* p)
-  {
-    BARRIER;
-    std::copy(p,p+4,_d);
-    BARRIER;
-    return *this;
-  }
-
-  void store(F* p) const
-  {
-    BARRIER;
-    std::copy(_d,_d+4,p);
-    BARRIER;
-  }
-
-  void store_a(F* p) const
-  {
-    BARRIER;
-    std::copy(_d,_d+4,p);
-    BARRIER;
-  }
-
-  Vec4d const& insert(uint32_t index, F value)
-  {
-    BARRIER;
-    _d[index] = value;
-    BARRIER;
-    return *this;
-  }
-
-  F extract(uint32_t index) const
-  {
-    BARRIER;
-    return _d[index];
-  }
-
-  F operator [] (uint32_t index) const {
-    return extract(index);
-  }
-
-  constexpr static int size()
-  {
-    return 4;
-  }
-
-};
-
 template<>
-struct base_floatingpoint<Vec4d>
+struct base_floatingpoint<Vec8f>
 {
-  using value = typename Vec4d::F;
+  using value = float;
 };
-
-/*****************************************************************************
-*
-*          Operators for Vec4d
-*
-*****************************************************************************/
-
-// vector operator + : add element by element
-static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x + y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator += : add
-static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x + y; });
-  BARRIER;
-  return a;
-}
-
-// postfix operator ++
-static inline Vec4d operator ++ (Vec4d & a, int) {
-  BARRIER;
-  Vec4d a0 = a;
-  a = a + Vec4d(1.0);
-  BARRIER;
-  return a0;
-}
-
-// prefix operator ++
-static inline Vec4d & operator ++ (Vec4d & a) {
-  BARRIER;
-  a = a + Vec4d(1.0);
-  BARRIER;
-  return a;
-}
-
-// vector operator - : subtract element by element
-static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x - y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator - : unary minus
-// Change sign bit, even for 0, INF and NAN
-static inline Vec4d operator - (Vec4d const & a) {
-  BARRIER;
-  Vec4d r(a);
-  for (size_t i = 0 ; i < 4 ; ++i)
-    r._d[i] = -a._d[i];
-  BARRIER;
-  return r;
-}
-
-// vector operator -= : subtract
-static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x - y; });
-  BARRIER;
-  return a;
-}
-
-// postfix operator --
-static inline Vec4d operator -- (Vec4d & a, int) {
-  BARRIER;
-  Vec4d a0 = a;
-  a = a - Vec4d(1.0);
-  BARRIER;
-  return a0;
-}
-
-// prefix operator --
-static inline Vec4d & operator -- (Vec4d & a) {
-  BARRIER;
-  a = a - Vec4d(1.0);
-  BARRIER;
-  return a;
-}
-
-// vector operator * : multiply element by element
-static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x * y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator *= : multiply
-static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x * y; });
-  BARRIER;
-  return a;
-}
-
-// vector operator / : divide all elements by same integer
-static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x / y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator /= : divide
-static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x / y; });
-  BARRIER;
-  return a;
-}
-
-// vector operator == : returns true for elements for which a == b
-static inline _vcl::Vec4db operator == (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ == b_;
-}
-
-// vector operator == : returns true for elements for which a == b
-static inline _vcl::Vec4db operator == (oc::OpCounter<double> a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ == b_;
-}
-
-// vector operator == : returns true for elements for which a == b
-static inline _vcl::Vec4db operator == (Vec4d const & b, oc::OpCounter<double> a) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ == b_;
-}
-
-// vector operator != : returns true for elements for which a != b
-static inline _vcl::Vec4db operator != (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ != b_;
-}
-
-// vector operator != : returns true for elements for which a != b
-static inline _vcl::Vec4db operator != (oc::OpCounter<double> a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ != b_;
-}
-
-// vector operator != : returns true for elements for which a != b
-static inline _vcl::Vec4db operator != (Vec4d const & b, oc::OpCounter<double> a) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ != b_;
-}
-
-// vector operator < : returns true for elements for which a < b
-static inline _vcl::Vec4db operator < (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ < b_;
-}
-
-// vector operator < : returns true for elements for which a < b
-static inline _vcl::Vec4db operator < (oc::OpCounter<double> a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ < b_;
-}
-
-// vector operator < : returns true for elements for which a < b
-static inline _vcl::Vec4db operator < (Vec4d const & b, oc::OpCounter<double> a) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return b_ < a_;
-}
-
-// vector operator <= : returns true for elements for which a <= b
-static inline _vcl::Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ <= b_;
-}
-
-// vector operator <= : returns true for elements for which a <= b
-static inline _vcl::Vec4db operator <= (oc::OpCounter<double> a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return a_ <= b_;
-}
-
-// vector operator <= : returns true for elements for which a <= b
-static inline _vcl::Vec4db operator <= (Vec4d const & b, oc::OpCounter<double> a) {
-  BARRIER;
-  _vcl::Vec4d a_(a._v), b_;
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec4d::F::comparisons(4);
-  BARRIER;
-  return b_ <= a_;
-}
-
-// vector operator > : returns true for elements for which a > b
-static inline _vcl::Vec4db operator > (Vec4d const & a, Vec4d const & b) {
-    return b < a;
-}
-
-// vector operator > : returns true for elements for which a > b
-static inline _vcl::Vec4db operator > (oc::OpCounter<double> a, Vec4d const & b) {
-    return a < b;
-}
-
-// vector operator > : returns true for elements for which a > b
-static inline _vcl::Vec4db operator > (Vec4d const & b, oc::OpCounter<double> a) {
-    return a < b;
-}
-
-// vector operator >= : returns true for elements for which a >= b
-static inline _vcl::Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
-    return b <= a;
-}
-
-// vector operator >= : returns true for elements for which a >= b
-static inline _vcl::Vec4db operator >= (oc::OpCounter<double> a, Vec4d const & b) {
-    return b <= a;
-}
-
-// vector operator >= : returns true for elements for which a >= b
-static inline _vcl::Vec4db operator >= (Vec4d const & b, oc::OpCounter<double> a) {
-    return a <= b;
-}
-
-
-// avoid logical operators for now, I don't think we need them
-#if 0
-
-// Bitwise logical operators
-
-// vector operator & : bitwise and
-static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) {
-    return _mm256_and_pd(a, b);
-}
-
-// vector operator &= : bitwise and
-static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) {
-    a = a & b;
-    return a;
-}
-
-// vector operator & : bitwise and of Vec4d and Vec4db
-static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) {
-    return _mm256_and_pd(a, b);
-}
-static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) {
-    return _mm256_and_pd(a, b);
-}
-
-// vector operator | : bitwise or
-static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) {
-    return _mm256_or_pd(a, b);
-}
-
-// vector operator |= : bitwise or
-static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) {
-    a = a | b;
-    return a;
-}
-
-// vector operator ^ : bitwise xor
-static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) {
-    return _mm256_xor_pd(a, b);
-}
-
-// vector operator ^= : bitwise xor
-static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) {
-    a = a ^ b;
-    return a;
-}
-
-// vector operator ! : logical not. Returns Boolean vector
-static inline Vec4db operator ! (Vec4d const & a) {
-    return a == Vec4d(0.0);
-}
-
-#endif
-
-
-// General arithmetic functions, etc.
-
-// Horizontal add: Calculates the sum of all vector elements.
-static inline Vec4d::F horizontal_add (Vec4d const & a) {
-  BARRIER;
-  return std::accumulate(a._d,a._d+4,Vec4d::F(0.0));
-  BARRIER;
-}
-
-// function max: a > b ? a : b
-static inline Vec4d max(Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return max(x,y); });
-  BARRIER;
-  return r;
-}
-
-// function min: a < b ? a : b
-static inline Vec4d min(Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return min(x,y); });
-  BARRIER;
-  return r;
-}
-
-// function abs: absolute value
-// Removes sign bit, even for -0.0f, -INF and -NAN
-static inline Vec4d abs(Vec4d const & a) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[](auto x){ return abs(x); });
-  BARRIER;
-  return r;
-}
-
-// function sqrt: square root
-static inline Vec4d sqrt(Vec4d const & a) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[](auto x){ return sqrt(x); });
-  BARRIER;
-  return r;
-}
-
-// function square: a * a
-static inline Vec4d square(Vec4d const & a) {
-  return a * a;
-}
-
-
-// exponential function
-static inline Vec4d exp(Vec4d const & a){
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[](auto x){ return exp(x); });
-  BARRIER;
-  return r;
-}
-
-// pow
-template <typename TT>
-static inline Vec4d pow(Vec4d const & a, oc::OpCounter<TT> n)
-{
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); });
-  BARRIER;
-  return r;
-}
-
-// pow
-template <typename TT>
-static inline
-std::enable_if_t<not oc::isOpCounterV<TT>, Vec4d> pow(Vec4d const & a, TT n)
-{
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); });
-  BARRIER;
-  return r;
-}
-
-
-static inline Vec4d select(const _vcl::Vec4db& s, const Vec4d& a, const Vec4d& b)
-{
-  BARRIER;
-  Vec4d r;
-  for(int i=0; i<4; ++i)
-    r._d[i] = s.extract(i) ? a._d[i] : b._d[i];
-  BARRIER;
-  return r;
-}
-
-// function round: round to nearest integer (even). (result as double vector)
-static inline Vec4d round(Vec4d const & a) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[](auto x){ return round(x); });
-  BARRIER;
-  return r;
-}
-
-// function truncate: round towards zero. (result as double vector)
-static inline Vec4d truncate(Vec4d const & a) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[](auto x){ return trunc(x); });
-  BARRIER;
-  return r;
-}
-
-// function floor: round towards minus infinity. (result as double vector)
-static inline Vec4d floor(Vec4d const & a) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[](auto x){ return floor(x); });
-  BARRIER;
-  return r;
-}
-
-// function ceil: round towards plus infinity. (result as double vector)
-static inline Vec4d ceil(Vec4d const & a) {
-  BARRIER;
-  Vec4d r;
-  std::transform(a._d,a._d+4,r._d,[](auto x){ return ceil(x); });
-  BARRIER;
-  return r;
-}
-
-#if 0
-// function round_to_int: round to nearest integer (even). (result as integer vector)
-static inline Vec4i round_to_int(Vec4d const & a) {
-    // Note: assume MXCSR control register is set to rounding
-    return _mm256_cvtpd_epi32(a);
-}
-
-// function truncate_to_int: round towards zero. (result as integer vector)
-static inline Vec4i truncate_to_int(Vec4d const & a) {
-    return _mm256_cvttpd_epi32(a);
-}
 #endif
 
-
-// Fused multiply and add functions
-
-// Multiply and add
-static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
-  BARRIER;
-  Vec4d r;
-  for (size_t i = 0 ; i < 4 ; ++i)
-    r._d[i] = a._d[i] * b._d[i] + c._d[i];
-  BARRIER;
-  return r;
-}
-
-
-// Multiply and subtract
-static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
-  BARRIER;
-  Vec4d r;
-  for (size_t i = 0 ; i < 4 ; ++i)
-    r._d[i] = a._d[i] * b._d[i] - c._d[i];
-  BARRIER;
-  return r;
-}
-
-// Multiply and inverse subtract
-static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
-  BARRIER;
-  Vec4d r;
-  for (size_t i = 0 ; i < 4 ; ++i)
-    r._d[i] = - a._d[i] * b._d[i] + c._d[i];
-  BARRIER;
-  return r;
-}
-
-
-template <int i0, int i1, int i2, int i3>
-static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) {
-  BARRIER;
-  _vcl::Vec4d a_,b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  _vcl::Vec4d r_ = _vcl::blend4d<i0,i1,i2,i3>(a_,b_);
-  BARRIER;
-  Vec4d::F::blends(1);
-  BARRIER;
-  Vec4d r;
-  BARRIER;
-  r_.store(r._d[0].data());
-  BARRIER;
-  return r;
-}
-
-// permute vector Vec4d
-template <int i0, int i1, int i2, int i3>
-static inline Vec4d permute4d(Vec4d const & a) {
-  BARRIER;
-  _vcl::Vec4d a_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  _vcl::Vec4d r_ = _vcl::permute4d<i0,i1,i2,i3>(a_);
-  BARRIER;
-  Vec4d::F::permutes(1);
-  BARRIER;
-  Vec4d r;
-  BARRIER;
-  r_.store(r._d[0].data());
-  BARRIER;
-  return r;
-}
-
-
 #if MAX_VECTOR_SIZE >= 512
-
-struct Vec8d
-{
-  oc::OpCounter<double> _d[8];
-
-  using F = oc::OpCounter<double>;
-
-  Vec8d()
-  {}
-
-  Vec8d(F d)
-  {
-    BARRIER;
-    std::fill(_d,_d+8,d);
-    BARRIER;
-  }
-
-  Vec8d(F dl, F du)
-  {
-    BARRIER;
-    std::fill(_d,_d+4,dl);
-    std::fill(_d+4,_d+8,du);
-    BARRIER;
-  }
-
-  Vec8d(Vec4d low, Vec4d high)
-  {
-    BARRIER;
-    std::copy(_d, _d+4, low._d);
-    std::copy(_d+4, _d+8, high._d);
-    BARRIER;
-  }
-
-  Vec8d(F d0, F d1, F d2, F d3, F d4, F d5, F d6, F d7)
-    : _d{d0,d1,d2,d3,d4,d5,d6,d7}
-  {
-    BARRIER;
-  }
-
-  Vec4d get_low() const
-  {
-    BARRIER;
-    Vec4d ret;
-    ret.load(_d);
-    BARRIER;
-    return ret;
-  }
-
-  Vec4d get_high() const
-  {
-    BARRIER;
-    Vec4d ret;
-    ret.load(_d + 4);
-    BARRIER;
-    return ret;
-  }
-
-  Vec8d& load(const F* p)
-  {
-    BARRIER;
-    std::copy(p,p+8,_d);
-    BARRIER;
-    return *this;
-  }
-
-  Vec8d& load_a(const F* p)
-  {
-    BARRIER;
-    std::copy(p,p+8,_d);
-    BARRIER;
-    return *this;
-  }
-
-  void store(F* p) const
-  {
-    BARRIER;
-    std::copy(_d,_d+8,p);
-    BARRIER;
-  }
-
-  void store_a(F* p) const
-  {
-    BARRIER;
-    std::copy(_d,_d+8,p);
-    BARRIER;
-  }
-
-  Vec8d const& insert(uint32_t index, F value)
-  {
-    BARRIER;
-    _d[index] = value;
-    BARRIER;
-    return *this;
-  }
-
-  F extract(uint32_t index) const
-  {
-    BARRIER;
-    return _d[index];
-  }
-
-  F operator [] (uint32_t index) const {
-    return extract(index);
-  }
-
-  constexpr static int size()
-  {
-    return 8;
-  }
-
-};
-
 template<>
 struct base_floatingpoint<Vec8d>
 {
-  using value = typename Vec8d::F;
-};
-
-/*****************************************************************************
-*
-*          Operators for Vec8d
-*
-*****************************************************************************/
-
-// vector operator + : add element by element
-static inline Vec8d operator + (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x + y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator += : add
-static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x + y; });
-  BARRIER;
-  return a;
-}
-
-// postfix operator ++
-static inline Vec8d operator ++ (Vec8d & a, int) {
-  BARRIER;
-  Vec8d a0 = a;
-  a = a + Vec8d(1.0);
-  BARRIER;
-  return a0;
-}
-
-// prefix operator ++
-static inline Vec8d & operator ++ (Vec8d & a) {
-  BARRIER;
-  a = a + Vec8d(1.0);
-  BARRIER;
-  return a;
-}
-
-// vector operator - : subtract element by element
-static inline Vec8d operator - (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x - y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator - : unary minus
-// Change sign bit, even for 0, INF and NAN
-static inline Vec8d operator - (Vec8d const & a) {
-  BARRIER;
-  Vec8d r(a);
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = -a._d[i];
-  BARRIER;
-  return r;
-}
-
-// vector operator -= : subtract
-static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x - y; });
-  BARRIER;
-  return a;
-}
-
-// postfix operator --
-static inline Vec8d operator -- (Vec8d & a, int) {
-  BARRIER;
-  Vec8d a0 = a;
-  a = a - Vec8d(1.0);
-  BARRIER;
-  return a0;
-}
-
-// prefix operator --
-static inline Vec8d & operator -- (Vec8d & a) {
-  BARRIER;
-  a = a - Vec8d(1.0);
-  BARRIER;
-  return a;
-}
-
-// vector operator * : multiply element by element
-static inline Vec8d operator * (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x * y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator *= : multiply
-static inline Vec8d & operator *= (Vec8d & a, Vec8d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x * y; });
-  BARRIER;
-  return a;
-}
-
-// vector operator / : divide all elements by same integer
-static inline Vec8d operator / (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x / y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator /= : divide
-static inline Vec8d & operator /= (Vec8d & a, Vec8d const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x / y; });
-  BARRIER;
-  return a;
-}
-
-// vector operator == : returns true for elements for which a == b
-static inline _vcl::Vec8db operator == (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  _vcl::Vec8d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8d::F::comparisons(8);
-  BARRIER;
-  return a_ == b_;
-}
-
-// vector operator != : returns true for elements for which a != b
-static inline _vcl::Vec8db operator != (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  _vcl::Vec8d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8d::F::comparisons(8);
-  BARRIER;
-  return a_ != b_;
-}
-
-// vector operator < : returns true for elements for which a < b
-static inline _vcl::Vec8db operator < (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  _vcl::Vec8d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8d::F::comparisons(8);
-  BARRIER;
-  return a_ < b_;
-}
-
-// vector operator <= : returns true for elements for which a <= b
-static inline _vcl::Vec8db operator <= (Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  _vcl::Vec8d a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8d::F::comparisons(8);
-  BARRIER;
-  return a_ <= b_;
-}
-
-// vector operator > : returns true for elements for which a > b
-static inline _vcl::Vec8db operator > (Vec8d const & a, Vec8d const & b) {
-    return b < a;
-}
-
-// vector operator >= : returns true for elements for which a >= b
-static inline _vcl::Vec8db operator >= (Vec8d const & a, Vec8d const & b) {
-    return b <= a;
-}
-
-// General arithmetic functions, etc.
-
-// Horizontal add: Calculates the sum of all vector elements.
-static inline Vec8d::F horizontal_add (Vec8d const & a) {
-  BARRIER;
-  return std::accumulate(a._d,a._d+8,Vec8d::F(0.0));
-  BARRIER;
-}
-
-// function max: a > b ? a : b
-static inline Vec8d max(Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return max(x,y); });
-  BARRIER;
-  return r;
-}
-
-// function min: a < b ? a : b
-static inline Vec8d min(Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return min(x,y); });
-  BARRIER;
-  return r;
-}
-
-// function abs: absolute value
-// Removes sign bit, even for -0.0f, -INF and -NAN
-static inline Vec8d abs(Vec8d const & a) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return abs(x); });
-  BARRIER;
-  return r;
-}
-
-// function sqrt: square root
-static inline Vec8d sqrt(Vec8d const & a) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return sqrt(x); });
-  BARRIER;
-  return r;
-}
-
-// function square: a * a
-static inline Vec8d square(Vec8d const & a) {
-  return a * a;
-}
-
-
-// exponential function
-static inline Vec8d exp(Vec8d const & a){
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return exp(x); });
-  BARRIER;
-  return r;
-}
-
-// function round: round to nearest integer (even). (result as double vector)
-static inline Vec8d round(Vec8d const & a) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return round(x); });
-  BARRIER;
-  return r;
-}
-
-// function truncate: round towards zero. (result as double vector)
-static inline Vec8d truncate(Vec8d const & a) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return trunc(x); });
-  BARRIER;
-  return r;
-}
-
-// function floor: round towards minus infinity. (result as double vector)
-static inline Vec8d floor(Vec8d const & a) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return floor(x); });
-  BARRIER;
-  return r;
-}
-
-// function ceil: round towards plus infinity. (result as double vector)
-static inline Vec8d ceil(Vec8d const & a) {
-  BARRIER;
-  Vec8d r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return ceil(x); });
-  BARRIER;
-  return r;
-}
-
-// Fused multiply and add functions
-
-// Multiply and add
-static inline Vec8d mul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
-  BARRIER;
-  Vec8d r;
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = a._d[i] * b._d[i] + c._d[i];
-  BARRIER;
-  return r;
-}
-
-
-// Multiply and subtract
-static inline Vec8d mul_sub(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
-  BARRIER;
-  Vec8d r;
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = a._d[i] * b._d[i] - c._d[i];
-  BARRIER;
-  return r;
-}
-
-// Multiply and inverse subtract
-static inline Vec8d nmul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
-  BARRIER;
-  Vec8d r;
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = - a._d[i] * b._d[i] + c._d[i];
-  BARRIER;
-  return r;
-}
-
-
-template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-static inline Vec8d blend8d(Vec8d const & a, Vec8d const & b) {
-  BARRIER;
-  _vcl::Vec8d a_,b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  _vcl::Vec8d r_ = _vcl::blend8d<i0,i1,i2,i3,i4,i5,i6,i7>(a_,b_);
-  BARRIER;
-  Vec8d::F::blends(1);
-  BARRIER;
-  Vec8d r;
-  BARRIER;
-  r_.store(r._d[0].data());
-  BARRIER;
-  return r;
-}
-
-// permute vector Vec4d
-template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-static inline Vec8d permute8d(Vec8d const & a) {
-  BARRIER;
-  _vcl::Vec8d a_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  _vcl::Vec8d r_ = _vcl::permute8d<i0,i1,i2,i3,i4,i5,i6,i7>(a_);
-  BARRIER;
-  Vec8d::F::permutes(1);
-  BARRIER;
-  Vec8d r;
-  BARRIER;
-  r_.store(r._d[0].data());
-  BARRIER;
-  return r;
-}
-
-#endif // MAC_VECTOR_SIZE >= 512
-
-
-struct Vec8f
-{
-  oc::OpCounter<float> _d[8];
-
-  using F = oc::OpCounter<float>;
-
-  Vec8f()
-  {}
-
-  Vec8f(F d)
-  {
-    BARRIER;
-    std::fill(_d,_d+8,d);
-    BARRIER;
-  }
-
-  Vec8f(double d)
-  {
-    BARRIER;
-    std::fill(_d,_d+8,d);
-    BARRIER;
-  }
-
-  Vec8f(F d0, F d1, F d2, F d3, F d4, F d5, F d6, F d7)
-    : _d{d0,d1,d2,d3,d4,d5,d6,d7}
-  {
-    BARRIER;
-  }
-
-  Vec8f& load(const F* p)
-  {
-    BARRIER;
-    std::copy(p,p+8,_d);
-    BARRIER;
-    return *this;
-  }
-
-  Vec8f& load_a(const F* p)
-  {
-    BARRIER;
-    std::copy(p,p+8,_d);
-    BARRIER;
-    return *this;
-  }
-
-  void store(F* p) const
-  {
-    BARRIER;
-    std::copy(_d,_d+8,p);
-    BARRIER;
-  }
-
-  void store_a(F* p) const
-  {
-    BARRIER;
-    std::copy(_d,_d+8,p);
-    BARRIER;
-  }
-
-  Vec8f const& insert(uint32_t index, F value)
-  {
-    BARRIER;
-    _d[index] = value;
-    BARRIER;
-    return *this;
-  }
-
-  F extract(uint32_t index) const
-  {
-    BARRIER;
-    return _d[index];
-  }
-
-  constexpr static int size()
-  {
-    return 8;
-  }
-
+  using value = double;
 };
 
 template<>
-struct base_floatingpoint<Vec8f>
+struct base_floatingpoint<Vec16f>
 {
-  using value = typename Vec8f::F;
+  using value = float;
 };
-
-/*****************************************************************************
-*
-*          Operators for Vec8f
-*
-*****************************************************************************/
-
-// vector operator + : add element by element
-static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x + y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator += : add
-static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x + y; });
-  BARRIER;
-  return a;
-}
-
-// postfix operator ++
-static inline Vec8f operator ++ (Vec8f & a, int) {
-  BARRIER;
-  Vec8f a0 = a;
-  a = a + Vec8f(1.0);
-  BARRIER;
-  return a0;
-}
-
-// prefix operator ++
-static inline Vec8f & operator ++ (Vec8f & a) {
-  BARRIER;
-  a = a + Vec8f(1.0);
-  BARRIER;
-  return a;
-}
-
-// vector operator - : subtract element by element
-static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x - y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator - : unary minus
-// Change sign bit, even for 0, INF and NAN
-static inline Vec8f operator - (Vec8f const & a) {
-  BARRIER;
-  Vec8f r(a);
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = -a._d[i];
-  BARRIER;
-  return r;
-}
-
-// vector operator -= : subtract
-static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x - y; });
-  BARRIER;
-  return a;
-}
-
-// postfix operator --
-static inline Vec8f operator -- (Vec8f & a, int) {
-  BARRIER;
-  Vec8f a0 = a;
-  a = a - Vec8f(1.0);
-  BARRIER;
-  return a0;
-}
-
-// prefix operator --
-static inline Vec8f & operator -- (Vec8f & a) {
-  BARRIER;
-  a = a - Vec8f(1.0);
-  BARRIER;
-  return a;
-}
-
-// vector operator * : multiply element by element
-static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x * y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator *= : multiply
-static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x * y; });
-  BARRIER;
-  return a;
-}
-
-// vector operator / : divide all elements by same integer
-static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x / y; });
-  BARRIER;
-  return r;
-}
-
-// vector operator /= : divide
-static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) {
-  BARRIER;
-  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x / y; });
-  BARRIER;
-  return a;
-}
-
-// vector operator == : returns true for elements for which a == b
-static inline _vcl::Vec8fb operator == (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  _vcl::Vec8f a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8f::F::comparisons(8);
-  BARRIER;
-  return a_ == b_;
-}
-
-// vector operator != : returns true for elements for which a != b
-static inline _vcl::Vec8fb operator != (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  _vcl::Vec8f a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8f::F::comparisons(8);
-  BARRIER;
-  return a_ != b_;
-}
-
-// vector operator < : returns true for elements for which a < b
-static inline _vcl::Vec8fb operator < (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  _vcl::Vec8f a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8f::F::comparisons(8);
-  BARRIER;
-  return a_ < b_;
-}
-
-// vector operator <= : returns true for elements for which a <= b
-static inline _vcl::Vec8fb operator <= (Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  _vcl::Vec8f a_, b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  Vec8f::F::comparisons(8);
-  BARRIER;
-  return a_ <= b_;
-}
-
-// vector operator > : returns true for elements for which a > b
-static inline _vcl::Vec8fb operator > (Vec8f const & a, Vec8f const & b) {
-    return b < a;
-}
-
-// vector operator >= : returns true for elements for which a >= b
-static inline _vcl::Vec8fb operator >= (Vec8f const & a, Vec8f const & b) {
-    return b <= a;
-}
-
-// avoid logical operators for now, I don't think we need them
-#if 0
-
-// Bitwise logical operators
-
-// vector operator & : bitwise and
-static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) {
-    return _mm256_and_pd(a, b);
-}
-
-// vector operator &= : bitwise and
-static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) {
-    a = a & b;
-    return a;
-}
-
-// vector operator & : bitwise and of Vec8f and Vec8fb
-static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) {
-    return _mm256_and_pd(a, b);
-}
-static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) {
-    return _mm256_and_pd(a, b);
-}
-
-// vector operator | : bitwise or
-static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) {
-    return _mm256_or_pd(a, b);
-}
-
-// vector operator |= : bitwise or
-static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) {
-    a = a | b;
-    return a;
-}
-
-// vector operator ^ : bitwise xor
-static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) {
-    return _mm256_xor_pd(a, b);
-}
-
-// vector operator ^= : bitwise xor
-static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) {
-    a = a ^ b;
-    return a;
-}
-
-// vector operator ! : logical not. Returns Boolean vector
-static inline Vec8fb operator ! (Vec8f const & a) {
-    return a == Vec8f(0.0);
-}
-
 #endif
 
-
-// General arithmetic functions, etc.
-
-// Horizontal add: Calculates the sum of all vector elements.
-static inline Vec8f::F horizontal_add (Vec8f const & a) {
-  BARRIER;
-  return std::accumulate(a._d,a._d+8,Vec8f::F(0.0));
-  BARRIER;
-}
-
-// function max: a > b ? a : b
-static inline Vec8f max(Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return max(x,y); });
-  BARRIER;
-  return r;
-}
-
-// function min: a < b ? a : b
-static inline Vec8f min(Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return min(x,y); });
-  BARRIER;
-  return r;
-}
-
-// function abs: absolute value
-// Removes sign bit, even for -0.0f, -INF and -NAN
-static inline Vec8f abs(Vec8f const & a) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return abs(x); });
-  BARRIER;
-  return r;
-}
-
-// function sqrt: square root
-static inline Vec8f sqrt(Vec8f const & a) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return sqrt(x); });
-  BARRIER;
-  return r;
-}
-
-// function square: a * a
-static inline Vec8f square(Vec8f const & a) {
-  return a * a;
-}
-
-
-// exponential function
-static inline Vec8f exp(Vec8f const & a){
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return exp(x); });
-  BARRIER;
-  return r;
-}
-
-
-// ignore pow() for now
-#if 0
-
-// pow(Vec8f, int):
-template <typename TT> static Vec8f pow(Vec8f const & a, TT n);
-
-// Raise floating point numbers to integer power n
-template <>
-inline Vec8f pow<int>(Vec8f const & x0, int n) {
-    return pow_template_i<Vec8f>(x0, n);
-}
-
-// allow conversion from unsigned int
-template <>
-inline Vec8f pow<uint32_t>(Vec8f const & x0, uint32_t n) {
-    return pow_template_i<Vec8f>(x0, (int)n);
-}
-
-
-// Raise floating point numbers to integer power n, where n is a compile-time constant
-template <int n>
-static inline Vec8f pow_n(Vec8f const & a) {
-    if (n < 0)    return Vec8f(1.0) / pow_n<-n>(a);
-    if (n == 0)   return Vec8f(1.0);
-    if (n >= 256) return pow(a, n);
-    Vec8f x = a;                       // a^(2^i)
-    Vec8f y;                           // accumulator
-    const int lowest = n - (n & (n-1));// lowest set bit in n
-    if (n & 1) y = x;
-    if (n < 2) return y;
-    x = x*x;                           // x^2
-    if (n & 2) {
-        if (lowest == 2) y = x; else y *= x;
-    }
-    if (n < 4) return y;
-    x = x*x;                           // x^4
-    if (n & 4) {
-        if (lowest == 4) y = x; else y *= x;
-    }
-    if (n < 8) return y;
-    x = x*x;                           // x^8
-    if (n & 8) {
-        if (lowest == 8) y = x; else y *= x;
-    }
-    if (n < 16) return y;
-    x = x*x;                           // x^16
-    if (n & 16) {
-        if (lowest == 16) y = x; else y *= x;
-    }
-    if (n < 32) return y;
-    x = x*x;                           // x^32
-    if (n & 32) {
-        if (lowest == 32) y = x; else y *= x;
-    }
-    if (n < 64) return y;
-    x = x*x;                           // x^64
-    if (n & 64) {
-        if (lowest == 64) y = x; else y *= x;
-    }
-    if (n < 128) return y;
-    x = x*x;                           // x^128
-    if (n & 128) {
-        if (lowest == 128) y = x; else y *= x;
-    }
-    return y;
-}
-
-template <int n>
-static inline Vec8f pow(Vec8f const & a, Const_int_t<n>) {
-    return pow_n<n>(a);
-}
-
 #endif
 
-// function round: round to nearest integer (even). (result as double vector)
-static inline Vec8f round(Vec8f const & a) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return round(x); });
-  BARRIER;
-  return r;
-}
-
-// function truncate: round towards zero. (result as double vector)
-static inline Vec8f truncate(Vec8f const & a) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return trunc(x); });
-  BARRIER;
-  return r;
-}
-
-// function floor: round towards minus infinity. (result as double vector)
-static inline Vec8f floor(Vec8f const & a) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return floor(x); });
-  BARRIER;
-  return r;
-}
-
-// function ceil: round towards plus infinity. (result as double vector)
-static inline Vec8f ceil(Vec8f const & a) {
-  BARRIER;
-  Vec8f r;
-  std::transform(a._d,a._d+8,r._d,[](auto x){ return ceil(x); });
-  BARRIER;
-  return r;
-}
-
-#if 0
-// function round_to_int: round to nearest integer (even). (result as integer vector)
-static inline Vec8i round_to_int(Vec8f const & a) {
-    // Note: assume MXCSR control register is set to rounding
-    return _mm256_cvtpd_epi32(a);
-}
-
-// function truncate_to_int: round towards zero. (result as integer vector)
-static inline Vec8i truncate_to_int(Vec8f const & a) {
-    return _mm256_cvttpd_epi32(a);
-}
-#endif
-
-
-// Fused multiply and add functions
-
-// Multiply and add
-static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
-  BARRIER;
-  Vec8f r;
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = a._d[i] * b._d[i] + c._d[i];
-  BARRIER;
-  return r;
-}
-
-
-// Multiply and subtract
-static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
-  BARRIER;
-  Vec8f r;
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = a._d[i] * b._d[i] - c._d[i];
-  BARRIER;
-  return r;
-}
-
-// Multiply and inverse subtract
-static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
-  BARRIER;
-  Vec8f r;
-  for (size_t i = 0 ; i < 8 ; ++i)
-    r._d[i] = - a._d[i] * b._d[i] + c._d[i];
-  BARRIER;
-  return r;
-}
-
-
-template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) {
-  BARRIER;
-  _vcl::Vec8f a_,b_;
-  BARRIER;
-  a_.load(a._d[0].data());
-  BARRIER;
-  b_.load(b._d[0].data());
-  BARRIER;
-  _vcl::Vec8f r_ = _vcl::blend8f<i0,i1,i2,i3,i4,i5,i6,i7>(a_,b_);
-  BARRIER;
-  Vec8f::F::blends(1);
-  BARRIER;
-  Vec8f r;
-  BARRIER;
-  r_.store(r._d[0].data());
-  BARRIER;
-  return r;
-}
-
-
-#endif // ENABLE_COUNTER
-
 #endif // DUNE_PDELAB_COMMON_VECTORCLASS_HH
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index bb9bf793b3adf64f3c29fa5883a97762d0438ee2..cc21196fa1679ad109510e3a65155b0394c60d3d 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -30,7 +30,8 @@ import cgen
 
 
 def _type_to_op_counter_type(name):
-    return "oc::OpCounter<{}>".format(name)
+    include_file("dune/opcounter/opcounter.hh")
+    return "OpCounter::OpCounter<{}>".format(name)
 
 
 def dtype_floatingpoint():
@@ -63,6 +64,10 @@ def type_floatingpoint():
     return numpy_to_cpp_dtype(NumpyType(dtype).dtype.name)
 
 
+def type_context_floatingpoint():
+    return {np.float32: 'f', np.float64: 'd'}.get(dtype_floatingpoint())
+
+
 class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper):
     def map_subscript(self, expr, type_context):
         arr = self.find_array(expr)
@@ -88,6 +93,10 @@ class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper):
         return expr
 
     def map_constant(self, expr, type_context):
+        # We correct the type context to force all floating point literals to be of
+        # the type that we use throughout the computation.
+        if type_context in ("f", "d"):
+            type_context = type_context_floatingpoint()
         ret = ExpressionToCExpressionMapper.map_constant(self, expr, type_context)
         if get_option('opcounter'):
             if type_context in ("f", "d"):
diff --git a/python/dune/perftool/loopy/transformations/instrumentation.py b/python/dune/perftool/loopy/transformations/instrumentation.py
index 89b08b6f0e191ca06db56d820c585ebe585e250b..2771ba141ccf5978067e3569038c4307bd01a357 100644
--- a/python/dune/perftool/loopy/transformations/instrumentation.py
+++ b/python/dune/perftool/loopy/transformations/instrumentation.py
@@ -22,7 +22,7 @@ def _union(a):
     return frozenset.union(*a)
 
 
-def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False):
+def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False, depends_on=frozenset()):
     """ Transform loopy kernel to contain instrumentation code
 
     Arguments:
@@ -32,6 +32,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     identifier : The name of the counter to start and stop
     level : The instrumentation level this measurement is defined at
     filetag : The tag of the file that should contain the counter definitions
+    depends_on: Additional dependencies to add to the start instruction. This is used to correct
+                currently wrong behaviour of the transformation in cases where a lot of structure
+                of the instrumentation is known a priori.
     """
     # If the instrumentation level is not high enough, this is a no-op
     if level > get_option("instrumentation_level"):
@@ -53,6 +56,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     insn_inames = _intersect(tuple(i.within_inames for i in insns))
     other_inames = _union(tuple(i.within_inames for i in lp.find_instructions(knl, lp.match.Not(match))))
     within = _intersect((insn_inames, other_inames))
+    uniontags = _intersect(tuple(i.tags for i in insns))
 
     # Get a unique identifer - note that the same timer could be started and stopped several times
     # within one kernel...
@@ -67,8 +71,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
                                  "HP_TIMER_START({});".format(identifier),
                                  id=start_id,
                                  within_inames=within,
-                                 depends_on=start_depends,
+                                 depends_on=depends_on.union(start_depends),
                                  boostable_into=frozenset(),
+                                 tags=uniontags,
                                  )
 
     # Add dependencies on the timing instructions
@@ -82,6 +87,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
                                 within_inames=within,
                                 depends_on=frozenset(i.id for i in insns),
                                 boostable_into=frozenset(),
+                                tags=uniontags,
                                 )
 
     # Find all the instructions that should depend on stop
@@ -98,4 +104,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o
     other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions))
 
     # Add all the modified instructions into the kernel object
-    return knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn])
+    knl = knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn])
+
+    from loopy.kernel.creation import resolve_dependencies
+    return resolve_dependencies(knl)
diff --git a/python/dune/perftool/pdelab/driver/error.py b/python/dune/perftool/pdelab/driver/error.py
index a4a7e2d414c7ffffce709c97d80c1b7a329dbef3..4934f06d66a7913041214130b51453df8cde979f 100644
--- a/python/dune/perftool/pdelab/driver/error.py
+++ b/python/dune/perftool/pdelab/driver/error.py
@@ -161,7 +161,7 @@ def accumulate_L2_squared():
 @preamble(section="error")
 def define_accumulated_L2_error(name):
     t = type_range()
-    return "{} {}(0.0);".format(t, name)
+    return "Dune::FieldVector<{}, 1> {}(0.0);".format(t, name)
 
 
 def name_accumulated_L2_error():
@@ -182,7 +182,7 @@ def compare_L2_squared():
             "if ({}.comm().rank() == 0){{".format(gv),
             "  std::cout << \"\\nl2errorsquared: \" << {} << std::endl << std::endl;".format(accum_error),
             "}",
-            "if (isnan({0}) or abs({0})>{1})".format(accum_error, get_option("compare_l2errorsquared")),
+            "if (isnan({0}[0]) or abs({0}[0])>{1})".format(accum_error, get_option("compare_l2errorsquared")),
             "  {} = true;".format(fail)]
 
 
diff --git a/python/dune/perftool/pdelab/driver/timings.py b/python/dune/perftool/pdelab/driver/timings.py
index 84f0ce839d7684d165faa6ab07b57dfe340d6b13..4ba2d8318de885030240adabcf3e9e139bb20524 100644
--- a/python/dune/perftool/pdelab/driver/timings.py
+++ b/python/dune/perftool/pdelab/driver/timings.py
@@ -8,6 +8,7 @@ from dune.perftool.generation import (cached,
                                       preamble,
                                       )
 from dune.perftool.pdelab.driver import (get_form_ident,
+                                         is_linear,
                                          name_initree,
                                          name_mpihelper,
                                          )
@@ -131,18 +132,19 @@ def apply_jacobian_timer():
     if get_option('instrumentation_level') >= 3:
         print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
 
-    if get_option('instrumentation_level') >= 2:
-        evaluation = ["HP_TIMER_START(apply_jacobian);",
-                      "{}.jacobian_apply({}, j);".format(n_go, v),
-                      "HP_TIMER_STOP(apply_jacobian);",
-                      "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)]
-        evaluation.extend(print_times)
-    else:
+    if is_linear():
+        declaration = ["{} j({});".format(t_v, v), "j=0.0;"]
         evaluation = ["{}.jacobian_apply({}, j);".format(n_go, v)]
+    else:
+        declaration = ["{} j0({});".format(t_v, v), "j0=0.0;",
+                       "{} j1({});".format(t_v, v), "j1=0.0;"]
+        evaluation = ["{}.nonlinear_jacobian_apply({}, j0, j1);".format(n_go, v)]
 
-    evaluation = ["{} j({});".format(t_v, v), "j=0.0;"] + evaluation
+    if get_option('instrumentation_level') >= 2:
+        evaluation = ["HP_TIMER_START(apply_jacobian);"] + evaluation + ["HP_TIMER_STOP(apply_jacobian);", "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)]
+        evaluation.extend(print_times)
 
-    return evaluation
+    return declaration + evaluation
 
 
 @preamble(section="timings")
diff --git a/python/dune/perftool/pdelab/driver/visitor.py b/python/dune/perftool/pdelab/driver/visitor.py
index 6549c9a18b8780a983abcfd0cc0be5f07d2e1e3f..da7c10ddb82273bef3b904a02295e3fe6e232eb5 100644
--- a/python/dune/perftool/pdelab/driver/visitor.py
+++ b/python/dune/perftool/pdelab/driver/visitor.py
@@ -4,11 +4,19 @@ from dune.perftool.ufl.visitor import UFL2LoopyVisitor
 import pymbolic.primitives as prim
 
 
-@preamble
+@preamble(section="init")
 def driver_using_statement(what):
     return "using {};".format(what)
 
 
+@preamble(section="gridoperator")
+def set_lop_to_starting_time():
+    from dune.perftool.pdelab.driver import get_form_ident
+    from dune.perftool.pdelab.driver.gridoperator import name_localoperator
+    lop = name_localoperator(get_form_ident())
+    return "{}.setTime(0.0);".format(lop)
+
+
 class DriverUFL2PymbolicVisitor(UFL2LoopyVisitor):
     def __init__(self):
         from dune.perftool.pdelab import PDELabInterface
@@ -33,6 +41,7 @@ class DriverUFL2PymbolicVisitor(UFL2LoopyVisitor):
             from dune.perftool.pdelab.driver import get_form_ident
             from dune.perftool.pdelab.driver.gridoperator import name_localoperator
             lop = name_localoperator(get_form_ident())
+            set_lop_to_starting_time()
             return prim.Call(prim.Variable("{}.getTime".format(lop)), ())
         else:
             return UFL2LoopyVisitor.coefficient(self, o)
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index a4bd0201c10f681499b6dbd450b585306ca4ec94..727b7ae35fae4ef86fe1a6884204d4cbf3acfb2f 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -572,8 +572,8 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     if add_timings and get_form_option("sumfact"):
         from dune.perftool.pdelab.signatures import assembler_routine_name
         kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage1"), "{}_kernel_stage1".format(assembler_routine_name()), 4)
-        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4)
-        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4)
+        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage1")}))
+        kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage2")}))
 
     if wrap_in_cgen:
         # Wrap the kernel in something which can generate code
diff --git a/test/poisson/CMakeLists.txt b/test/poisson/CMakeLists.txt
index a05cbe8c3650c9e33f8f5c4f1bbce696b2df9ba8..0a4f9897176e1aaa2a2864c23bf97135eb67261b 100644
--- a/test/poisson/CMakeLists.txt
+++ b/test/poisson/CMakeLists.txt
@@ -44,10 +44,12 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
                                   )
 
 # 8. Poisson with operator counting
-dune_add_formcompiler_system_test(UFLFILE opcount_poisson_dg.ufl
-                                  BASENAME opcount_poisson_dg_symdiff
-                                  INIFILE opcount_poisson_dg_symdiff.mini
-                                  )
+if(dune-opcounter_FOUND)
+  dune_add_formcompiler_system_test(UFLFILE opcount_poisson_dg.ufl
+                                    BASENAME opcount_poisson_dg_symdiff
+                                    INIFILE opcount_poisson_dg_symdiff.mini
+                                    )
+endif()
 
 # 9. Poisson Test Case: DG quadrilaterals
 dune_add_formcompiler_system_test(UFLFILE poisson_dg_quadrilateral.ufl