diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index f73c6a2f717b243e32411a5feea948c7777ce430..9ed1974055859fa175cdbe2c86d0bcabd80fb8bc 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -75,6 +75,48 @@ void transpose_reg (Vec8d& a0, Vec8d& a1)
   a1 = b1;
 }
 
+namespace impl
+{
+  /* In place: (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp).
+   * inline: defined in a header, so it must not violate the ODR. */
+  inline void swap_halves(Vec8d& a, Vec8d& b)
+  {
+    const Vec4d a_upper = a.get_high();  // saved before a is overwritten
+    a = Vec8d(a.get_low(), b.get_low());
+    b = Vec8d(a_upper, b.get_high());
+  }
+
+  /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose
+   * on the lower and upper halves of the Vec8d; operates in place.
+   * Step 1 interleaves row pairs, step 2 regroups them in lane pairs. */
+  inline void _transpose4x8(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
+  {
+    const Vec8d b0 = blend8d<0,8,2,10,4,12,6,14>(a0,a1);
+    const Vec8d b1 = blend8d<1,9,3,11,5,13,7,15>(a0,a1);
+    const Vec8d b2 = blend8d<0,8,2,10,4,12,6,14>(a2,a3);
+    const Vec8d b3 = blend8d<1,9,3,11,5,13,7,15>(a2,a3);
+    a0 = blend8d<0,1,8,9,4,5,12,13>(b0,b2);
+    a1 = blend8d<0,1,8,9,4,5,12,13>(b1,b3);
+    a2 = blend8d<2,3,10,11,6,7,14,15>(b0,b2);
+    a3 = blend8d<2,3,10,11,6,7,14,15>(b1,b3);
+  }
+}
+
+/* In-place 8x8 transpose of the matrix whose rows are a0..a7. Each of the
+ * four 4x4 subblocks is transposed with the Vec4d shuffling pattern, then
+ * the two off-diagonal blocks are swapped. Marked inline because it is
+ * defined in a header and may be included by several translation units. */
+inline void transpose(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3,
+                      Vec8d& a4, Vec8d& a5, Vec8d& a6, Vec8d& a7)
+{
+  impl::_transpose4x8(a0,a1,a2,a3);
+  impl::_transpose4x8(a4,a5,a6,a7);
+  impl::swap_halves(a0,a4);
+  impl::swap_halves(a1,a5);
+  impl::swap_halves(a2,a6);
+  impl::swap_halves(a3,a7);
+}
+
 #endif
 
 #endif