diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt
index 5a75b84e1c0c3b4d4daa19381d7032489d2cbddb..f472b3d2b80421c4f18b1283a63f80a065a97435 100644
--- a/applications/CMakeLists.txt
+++ b/applications/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(poisson_dg)
 add_subdirectory(poisson_dg_tensor)
+add_subdirectory(knl)
diff --git a/applications/knl/CMakeLists.txt b/applications/knl/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a75b84e1c0c3b4d4daa19381d7032489d2cbddb
--- /dev/null
+++ b/applications/knl/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(poisson_dg)
+add_subdirectory(poisson_dg_tensor)
diff --git a/applications/knl/poisson_dg/CMakeLists.txt b/applications/knl/poisson_dg/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a906a02d5765bd323ab634bb810010bfa92e1535
--- /dev/null
+++ b/applications/knl/poisson_dg/CMakeLists.txt
@@ -0,0 +1,11 @@
+dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
+                                  BASENAME app_knl_poisson
+                                  INIFILE knl_poisson_dg.mini
+                                  NO_TESTS
+                                  )
+
+dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
+                                  BASENAME verify_app_knl_poisson_dg
+                                  INIFILE verify.mini
+                                  NO_TESTS
+                                  )
diff --git a/applications/knl/poisson_dg/knl_poisson_dg.mini b/applications/knl/poisson_dg/knl_poisson_dg.mini
new file mode 100644
index 0000000000000000000000000000000000000000..9951705d981a39a8e9c20c089fd117eb76b4e2ee
--- /dev/null
+++ b/applications/knl/poisson_dg/knl_poisson_dg.mini
@@ -0,0 +1,50 @@
+__name = app_knl_poisson_{__exec_suffix}
+__exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{formcompiler.instrumentation_level}
+
+opcount_suffix = opcount, nonopcount | expand opcount
+{opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude
+
+# Calculate the size of the grid to equlibritate it to 100 MB/rank
+# Input parameters
+dim = 3
+mbperrank = 100
+ranks = 66
+floatingbytes = 8
+
+# Metaini Calculations
+memperrank = {mbperrank} * 1048576 | eval
+dofsperdir = {formcompiler.ufl_variants.degree} + 1 | eval
+celldofs = {dofsperdir} ** {dim} | eval
+cellsperrank = {memperrank} / ({floatingbytes} * {celldofs}) | eval
+cellsperdir = {cellsperrank} ** (1/{dim}) | eval | toint
+firstdircells = {ranks} * {cellsperdir} | eval
+dimminusone = {dim} - 1 | eval
+ones = 1 | repeat {dimminusone}
+otherdircells = {cellsperdir} | repeat {dimminusone}
+
+# Set up the timing identifier
+identifier = knl_poisson_dg_deg{formcompiler.ufl_variants.degree}
+
+# Setup the grid!
+extension = 1.0 | repeat {dim}
+cells = {firstdircells} {otherdircells}
+partitioning = {ranks} {ones}
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+fastdg = 1
+sumfact = 1
+vectorize_quad = 1
+vectorize_diagonal = 1
+instrumentation_level = 2, 3, 4 | expand
+opcounter = 1, 0 | expand opcount
+time_opcounter = 0, 1 | expand opcount
+quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
+architecture = knl
+
+[formcompiler.ufl_variants]
+cell = hexahedron
+degree = 1, 3, 5, 7, 9  | expand
diff --git a/applications/knl/poisson_dg/poisson_dg.ufl b/applications/knl/poisson_dg/poisson_dg.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..2c0f2c34436f1c6b76817d6abad344fae3183d62
--- /dev/null
+++ b/applications/knl/poisson_dg/poisson_dg.ufl
@@ -0,0 +1,32 @@
+dim = 3
+x = SpatialCoordinate(cell)
+f = -6.
+g = x[0]*x[0] + x[1]*x[1] + x[2]*x[2]
+
+V = FiniteElement("DG", cell, degree)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+alpha = 1.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+theta = -1.0
+
+r = inner(grad(u), grad(v))*dx \
+  + inner(n, avg(grad(u)))*jump(v)*dS \
+  + cse_gamma_int*jump(u)*jump(v)*dS \
+  - theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + cse_gamma_ext*u*v*ds \
+  + theta*u*inner(grad(v), n)*ds \
+  - f*v*dx \
+  - theta*g*inner(grad(v), n)*ds \
+  - cse_gamma_ext*g*v*ds
+
+forms = [r]
diff --git a/applications/knl/poisson_dg/verify.mini b/applications/knl/poisson_dg/verify.mini
new file mode 100644
index 0000000000000000000000000000000000000000..bae219fc0b2336b1109b109b8c236e9e8ef9cc5c
--- /dev/null
+++ b/applications/knl/poisson_dg/verify.mini
@@ -0,0 +1,21 @@
+__name = verify_app_knl_poisson_dg
+
+# Setup the grid!
+extension = 1.0 1.0 1.0
+cells = 8 8 8
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+fastdg = 1
+sumfact = 1
+vectorize_quad = 1
+vectorize_diagonal = 1
+quadrature_order = 6
+architecture = knl
+
+[formcompiler.ufl_variants]
+cell = hexahedron
+degree = 3
diff --git a/applications/knl/poisson_dg_tensor/CMakeLists.txt b/applications/knl/poisson_dg_tensor/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..86889e3f0b4893c6afcf0a4e21ed954f75a626c7
--- /dev/null
+++ b/applications/knl/poisson_dg_tensor/CMakeLists.txt
@@ -0,0 +1,11 @@
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_tensor.ufl
+                                  BASENAME app_knl_poisson_tensor
+                                  INIFILE knl_poisson_dg_tensor.mini
+                                  NO_TESTS
+                                  )
+
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_tensor.ufl
+                                  BASENAME verify_app_knl_poisson_dg_tensor
+                                  INIFILE verify.mini
+                                  NO_TESTS
+                                  )
diff --git a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
new file mode 100644
index 0000000000000000000000000000000000000000..07aa0a438765cc5ddddf2f46dad71ae7be37d835
--- /dev/null
+++ b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini
@@ -0,0 +1,50 @@
+__name = app_knl_poisson_tensor_{__exec_suffix}
+__exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{formcompiler.instrumentation_level}
+
+opcount_suffix = opcount, nonopcount | expand opcount
+{opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude
+
+# Calculate the size of the grid to equlibritate it to 100 MB/rank
+# Input parameters
+dim = 3
+mbperrank = 100
+ranks = 66
+floatingbytes = 8
+
+# Metaini Calculations
+memperrank = {mbperrank} * 1048576 | eval
+dofsperdir = {formcompiler.ufl_variants.degree} + 1 | eval
+celldofs = {dofsperdir} ** {dim} | eval
+cellsperrank = {memperrank} / ({floatingbytes} * {celldofs}) | eval
+cellsperdir = {cellsperrank} ** (1/{dim}) | eval | toint
+firstdircells = {ranks} * {cellsperdir} | eval
+dimminusone = {dim} - 1 | eval
+ones = 1 | repeat {dimminusone}
+otherdircells = {cellsperdir} | repeat {dimminusone}
+
+# Set up the timing identifier
+identifier = knl_poisson_dg_deg{formcompiler.ufl_variants.degree}
+
+# Setup the grid!
+extension = 1.0 | repeat {dim}
+cells = {firstdircells} {otherdircells}
+partitioning = {ranks} {ones}
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+fastdg = 1
+sumfact = 1
+vectorize_quad = 1
+vectorize_diagonal = 1
+instrumentation_level = 2, 3, 4 | expand
+opcounter = 1, 0 | expand opcount
+time_opcounter = 0, 1 | expand opcount
+quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
+architecture = knl
+
+[formcompiler.ufl_variants]
+cell = hexahedron
+degree = 1, 3, 5, 7, 9  | expand
diff --git a/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..00267f15b9331b4705138cfb1a4698ee357a5767
--- /dev/null
+++ b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl
@@ -0,0 +1,35 @@
+dim = 3
+x = SpatialCoordinate(cell)
+
+g = x[0]*x[0] + x[1]*x[1] + x[2]*x[2]
+I = Identity(3)
+A = as_matrix([[x[i]*x[j] + I[i,j] for j in range(3)] for i in range(3)])
+f = -6.
+c = 10.
+
+V = FiniteElement("DG", cell, degree)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+alpha = 3.0
+h_ext = CellVolume(cell) / FacetArea(cell)
+cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext
+h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell)
+cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int
+
+theta = -1.0
+
+r = (inner(A*grad(u), grad(v)) + (c*u-f)*v)*dx \
+  + inner(n, A*avg(grad(u)))*jump(v)*dS \
+  + cse_gamma_int*jump(u)*jump(v)*dS \
+  - theta*jump(u)*inner(A*avg(grad(v)), n)*dS \
+  - inner(n, A*grad(u))*v*ds \
+  + cse_gamma_ext*u*v*ds \
+  + theta*u*inner(A*grad(v), n)*ds \
+  - theta*g*inner(A*grad(v), n)*ds \
+  - cse_gamma_ext*g*v*ds
+
+forms = [r]
diff --git a/applications/knl/poisson_dg_tensor/verify.mini b/applications/knl/poisson_dg_tensor/verify.mini
new file mode 100644
index 0000000000000000000000000000000000000000..2f0d7170f89fccc6c4e65888f8cdab176c2dd015
--- /dev/null
+++ b/applications/knl/poisson_dg_tensor/verify.mini
@@ -0,0 +1,21 @@
+__name = verify_app_knl_poisson_dg_tensor
+
+# Setup the grid!
+extension = 1.0 1.0 1.0
+cells = 8 8 8
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+fastdg = 1
+sumfact = 1
+vectorize_quad = 1
+vectorize_diagonal = 1
+quadrature_order = 6
+architecture = knl
+
+[formcompiler.ufl_variants]
+cell = hexahedron
+degree = 3
diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index e8ea4bd424beb1d30c28ca716113021b2713aa6b..36a037e322cf430c6b0fbc8495e06512fa84b1ac 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -11,4 +11,5 @@ dune_install_python_script(SCRIPT performance_regression.py
                            )
 
 dune_symlink_to_source_files(FILES make_graph.sh
+                                   knltimings.sh
                              )
diff --git a/bin/knltimings.sh b/bin/knltimings.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2f238395262f6eb11a35707f7d6556541eaac673
--- /dev/null
+++ b/bin/knltimings.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# If an argument was given use it as the working directory
+if [ $# -eq 1 ]
+then
+  cd $1
+fi
+
+# Search for runnable executables
+FILES=$(ls *.ini)
+for inifile in $FILES
+do
+  line=$(grep ^"opcounter = " $inifile)
+  extract=${line##opcounter = }
+  UPPER=10
+  if [ $extract -eq 1 ]
+  then
+    UPPER=1
+  fi
+  COUNT=0
+  while [ $COUNT -lt $UPPER ]; do
+    exec=${inifile%.ini}
+	line=$(grep ^"ranks = " $inifile)
+	np=${line##ranks = }
+    mpirun -np $np ./$exec $inifile
+    COUNT=$((COUNT + 1))
+  done
+done
diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh
index 3b857c35cbb90bd34a24b8b35c300c8b30df3eaf..8d31883eb80faebe8101f2e045d0da31b029642b 100644
--- a/dune/perftool/common/vectorclass.hh
+++ b/dune/perftool/common/vectorclass.hh
@@ -165,7 +165,7 @@ static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) {
 static inline Vec4d operator - (Vec4d const & a) {
   BARRIER;
   Vec4d r(a);
-  for (size_t i = 0 ; i < 3 ; ++i)
+  for (size_t i = 0 ; i < 4 ; ++i)
     r._d[i] = -a._d[i];
   BARRIER;
   return r;
@@ -590,6 +590,427 @@ static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) {
   return r;
 }
 
+#if MAC_VECTOR_SIZE >= 512
+
+struct Vec8d
+{
+  oc::OpCounter<double> _d[8];
+
+  using F = oc::OpCounter<double>;
+
+  Vec8d()
+  {}
+
+  Vec8d(F d)
+  {
+    BARRIER;
+    std::fill(_d,_d+8,d);
+    BARRIER;
+  }
+
+  Vec8d(double d)
+  {
+    BARRIER;
+    std::fill(_d,_d+8,d);
+    BARRIER;
+  }
+
+  Vec8d(F d0, F d1, F d2, F d3, F d4, F d5, F d6, F d7)
+    : _d{d0,d1,d2,d3,d4,d5,d6,d7}
+  {
+    BARRIER;
+  }
+
+  Vec8d& load(const F* p)
+  {
+    BARRIER;
+    std::copy(p,p+8,_d);
+    BARRIER;
+    return *this;
+  }
+
+  Vec8d& load_a(const F* p)
+  {
+    BARRIER;
+    std::copy(p,p+8,_d);
+    BARRIER;
+    return *this;
+  }
+
+  void store(F* p) const
+  {
+    BARRIER;
+    std::copy(_d,_d+8,p);
+    BARRIER;
+  }
+
+  void store_a(F* p) const
+  {
+    BARRIER;
+    std::copy(_d,_d+8,p);
+    BARRIER;
+  }
+
+  Vec8d const& insert(uint32_t index, F value)
+  {
+    BARRIER;
+    _d[index] = value;
+    BARRIER;
+    return *this;
+  }
+
+  F extract(uint32_t index) const
+  {
+    BARRIER;
+    return _d[index];
+  }
+
+  constexpr static int size()
+  {
+    return 8;
+  }
+
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8d
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8d operator + (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x + y; });
+  BARRIER;
+  return r;
+}
+
+// vector operator += : add
+static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) {
+  BARRIER;
+  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x + y; });
+  BARRIER;
+  return a;
+}
+
+// postfix operator ++
+static inline Vec8d operator ++ (Vec8d & a, int) {
+  BARRIER;
+  Vec8d a0 = a;
+  a = a + 1.0;
+  BARRIER;
+  return a0;
+}
+
+// prefix operator ++
+static inline Vec8d & operator ++ (Vec8d & a) {
+  BARRIER;
+  a = a + 1.0;
+  BARRIER;
+  return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8d operator - (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x - y; });
+  BARRIER;
+  return r;
+}
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec8d operator - (Vec8d const & a) {
+  BARRIER;
+  Vec8d r(a);
+  for (size_t i = 0 ; i < 8 ; ++i)
+    r._d[i] = -a._d[i];
+  BARRIER;
+  return r;
+}
+
+// vector operator -= : subtract
+static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) {
+  BARRIER;
+  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x - y; });
+  BARRIER;
+  return a;
+}
+
+// postfix operator --
+static inline Vec8d operator -- (Vec8d & a, int) {
+  BARRIER;
+  Vec8d a0 = a;
+  a = a - 1.0;
+  BARRIER;
+  return a0;
+}
+
+// prefix operator --
+static inline Vec8d & operator -- (Vec8d & a) {
+  BARRIER;
+  a = a - 1.0;
+  BARRIER;
+  return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8d operator * (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x * y; });
+  BARRIER;
+  return r;
+}
+
+// vector operator *= : multiply
+static inline Vec8d & operator *= (Vec8d & a, Vec8d const & b) {
+  BARRIER;
+  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x * y; });
+  BARRIER;
+  return a;
+}
+
+// vector operator / : divide all elements by same integer
+static inline Vec8d operator / (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x / y; });
+  BARRIER;
+  return r;
+}
+
+// vector operator /= : divide
+static inline Vec8d & operator /= (Vec8d & a, Vec8d const & b) {
+  BARRIER;
+  std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x / y; });
+  BARRIER;
+  return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline _vcl::Vec8db operator == (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  _vcl::Vec8d a_, b_;
+  BARRIER;
+  a_.load(a._d[0].data());
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec8d::F::comparisons(8);
+  BARRIER;
+  return a_ == b_;
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline _vcl::Vec8db operator != (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  _vcl::Vec8d a_, b_;
+  BARRIER;
+  a_.load(a._d[0].data());
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec8d::F::comparisons(8);
+  BARRIER;
+  return a_ != b_;
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline _vcl::Vec8db operator < (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  _vcl::Vec8d a_, b_;
+  BARRIER;
+  a_.load(a._d[0].data());
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec8d::F::comparisons(8);
+  BARRIER;
+  return a_ < b_;
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline _vcl::Vec8db operator <= (Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  _vcl::Vec8d a_, b_;
+  BARRIER;
+  a_.load(a._d[0].data());
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  Vec8d::F::comparisons(8);
+  BARRIER;
+  return a_ <= b_;
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline _vcl::Vec8db operator > (Vec8d const & a, Vec8d const & b) {
+    return b < a;
+}
+
+// vector operator >= : returns true for elements for which a >= b
+static inline _vcl::Vec8db operator >= (Vec8d const & a, Vec8d const & b) {
+    return b <= a;
+}
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline Vec8d::F horizontal_add (Vec8d const & a) {
+  BARRIER;
+  return std::accumulate(a._d,a._d+8,Vec8d::F(0.0));
+  BARRIER;
+}
+
+// function max: a > b ? a : b
+static inline Vec8d max(Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return max(x,y); });
+  BARRIER;
+  return r;
+}
+
+// function min: a < b ? a : b
+static inline Vec8d min(Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return min(x,y); });
+  BARRIER;
+  return r;
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec8d abs(Vec8d const & a) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,r._d,[](auto x){ return abs(x); });
+  BARRIER;
+  return r;
+}
+
+// function sqrt: square root
+static inline Vec8d sqrt(Vec8d const & a) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,r._d,[](auto x){ return sqrt(x); });
+  BARRIER;
+  return r;
+}
+
+// function square: a * a
+static inline Vec8d square(Vec8d const & a) {
+  return a * a;
+}
+
+
+// exponential function
+static inline Vec8d exp(Vec8d const & a){
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,r._d,[](auto x){ return exp(x); });
+  BARRIER;
+  return r;
+}
+
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec8d round(Vec8d const & a) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,r._d,[](auto x){ return round(x); });
+  BARRIER;
+  return r;
+}
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec8d truncate(Vec8d const & a) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,r._d,[](auto x){ return trunc(x); });
+  BARRIER;
+  return r;
+}
+
+// function floor: round towards minus infinity. (result as double vector)
+static inline Vec8d floor(Vec8d const & a) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,r._d,[](auto x){ return floor(x); });
+  BARRIER;
+  return r;
+}
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec8d ceil(Vec8d const & a) {
+  BARRIER;
+  Vec8d r;
+  std::transform(a._d,a._d+8,r._d,[](auto x){ return ceil(x); });
+  BARRIER;
+  return r;
+}
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec8d mul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+  BARRIER;
+  Vec8d r;
+  for (size_t i = 0 ; i < 8 ; ++i)
+    r._d[i] = a._d[i] * b._d[i] + c._d[i];
+  BARRIER;
+  return r;
+}
+
+
+// Multiply and subtract
+static inline Vec8d mul_sub(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+  BARRIER;
+  Vec8d r;
+  for (size_t i = 0 ; i < 8 ; ++i)
+    r._d[i] = a._d[i] * b._d[i] - c._d[i];
+  BARRIER;
+  return r;
+}
+
+// Multiply and inverse subtract
+static inline Vec8d nmul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+  BARRIER;
+  Vec8d r;
+  for (size_t i = 0 ; i < 8 ; ++i)
+    r._d[i] = - a._d[i] * b._d[i] + c._d[i];
+  BARRIER;
+  return r;
+}
+
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d blend8d(Vec8d const & a, Vec8d const & b) {
+  BARRIER;
+  _vcl::Vec8d a_,b_;
+  BARRIER;
+  a_.load(a._d[0].data());
+  BARRIER;
+  b_.load(b._d[0].data());
+  BARRIER;
+  _vcl::Vec8d r_ = _vcl::blend8d<i0,i1,i2,i3,i4,i5,i6,i7>(a_,b_);
+  BARRIER;
+  Vec8d::F::blends(1);
+  BARRIER;
+  Vec8d r;
+  BARRIER;
+  r_.store(r._d[0].data());
+  BARRIER;
+  return r;
+}
+
+#endif // MAC_VECTOR_SIZE >= 512
+
 #endif // ENABLE_COUNTER
 
 #endif // DUNE_PDELAB_COMMON_VECTORCLASS_HH
diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index 5e84c98b935f191b1df88ade2ab50e3753d7e3c4..9d924bf900afc5fc1865b142cb0501d8a78b7a34 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -3,20 +3,6 @@
 
 #include<dune/perftool/common/vectorclass.hh>
 
-
-#if MAX_VECTOR_SIZE >= 128
-
-void transpose_reg(Vec2d& a0, Vec2d& a1)
-{
-  Vec2d b0, b1;
-  b0 = blend2d<0,2>(a0, a1);
-  b1 = blend2d<1,3>(a0, a1);
-  a0 = b0;
-  a1 = b1;
-}
-
-#endif
-
 #if MAX_VECTOR_SIZE >= 256
 
 void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3)
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index 89c5791e0edb1d08eb51bff020a910d2b4f7c8ac..df7bc8314ebd2ef3f0d79b70ca7222ceb3e92b71 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -55,7 +55,8 @@ def _realize_sum_factorization_kernel(sf):
         insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
                                                          within_inames=frozenset(sf.within_inames),
                                                          depends_on=insn_dep,
-                                                         ),}))
+                                                         ),
+                                             }))
 
     # Set up the input for stage 1
     if sf.stage == 1 and not get_option("fastdg"):
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index 0cbb6b814b13b3f60bacc065ea539ed0a8a37af5..790548f1b99a9cd36d8e3c395890debb419c2cec 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -120,8 +120,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
 
         # The following construction is a bit weird: Dict comprehensions do not have
         # access to the locals of the calling scope: So we need to do the eval beforehand
-        defaults = [eval(a) for a in SumfactKernel.init_arg_names]
-        defaultdict = {a: defaults[SumfactKernel.init_arg_names.index(a)] for a in SumfactKernel.init_arg_names}
+        defaultdict = {}
+        for a in SumfactKernel.init_arg_names:
+            defaultdict[a] = eval(a)
 
         # Call the base class constructors
         ImmutableRecord.__init__(self, **defaultdict)
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index 466690a4a9fa3fc414ba790c93a098e398abfad5..92a8883eda78d836f002a984afdc66f95c830f1d 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -162,7 +162,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
     def pymbolic(self, indices):
         assert len(indices) == 3
 
-        # Check whether we can realize this by broadcasting the values of a simple tabulation 
+        # Check whether we can realize this by broadcasting the values of a simple tabulation
         if len(set(self.tabs)) == 1:
             theta = self.tabs[0].pymbolic(indices[:-1])
             return prim.Call(ExplicitVCLCast(np.float64, vector_width=len(self.tabs)), (theta,))
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index b856ccc76afc649e8af310f9fdfebaf5df59f4d7..6900885499385a62fc9dbca9d1e9adac7e444551 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -133,12 +133,14 @@ def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1):
 def diagonal_vectorization_strategy(sumfacts, width):
     if width == 4:
         horizontal, vertical = 2, 2
+    elif width == 8:
+        horizontal, vertical = 4, 2
     else:
         raise NotImplementedError
 
     result = {}
 
-    horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=0)
+    horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=1)
 
     for sf in horizontal_kernels:
         vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width)
@@ -164,19 +166,23 @@ def decide_vectorization_strategy():
         for inputkey in inputkeys:
             width = get_vcl_type_size(np.float64)
             sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
-            sfdict.update(**horizontal_vectorization_strategy(sumfact_filter, width))
+            for old, new in horizontal_vectorization_strategy(sumfact_filter, width).items():
+                sfdict[old] = new
     elif get_option("vectorize_slice"):
         for sumfact in sumfacts:
             width = get_vcl_type_size(np.float64)
-            sfdict.update(**vertical_vectorization_strategy(sumfact, width))
+            for old, new in vertical_vectorization_strategy(sumfact, width).items():
+                sfdict[old] = new
     elif get_option("vectorize_diagonal"):
         inputkeys = set(sf.input_key for sf in sumfacts)
         for inputkey in inputkeys:
             width = get_vcl_type_size(np.float64)
             sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
-            sfdict.update(**diagonal_vectorization_strategy(sumfact_filter, width))
+            for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
+                sfdict[old] = new
     else:
-        sfdict.update(**no_vectorization(sumfacts))
+        for old, new in no_vectorization(sumfacts).items():
+            sfdict[old] = new
 
     # Register the results
     for sf in sumfacts: