diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index 9d924bf900afc5fc1865b142cb0501d8a78b7a34..6a40a8ea466f1c36fdbe18d5b7557441589cefdf 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -33,15 +33,25 @@ void transpose_reg(Vec4d& a0, Vec4d& a1)
 
 void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
 {
-  Vec8d b0, b1, b2, b3;
-  b0 = blend8d<0,1,8,9,2,3,10,11>(a0, a1);
-  b1 = blend8d<4,5,12,13,6,7,14,15>(a0, a1);
-  b2 = blend8d<0,1,8,9,2,3,10,11>(a2, a3);
-  b3 = blend8d<4,5,12,13,6,7,14,15>(a2, a3);
-  a0 = blend8d<0,1,2,3,8,9,10,11>(b0, b2);
-  a1 = blend8d<4,5,6,7,12,13,14,15>(b0, b2);
-  a2 = blend8d<0,1,2,3,8,9,10,11>(b1, b3);
-  a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
+  // Reinterpret the double registers as 512-bit integer registers:
+  // _mm512_mask_alignr_epi64 (valignq) shuffles 64-bit lanes; the casts
+  // are bit-pattern preserving and generate no instructions.
+  auto ac = _mm512_castpd_si512(a0);
+  auto bc = _mm512_castpd_si512(a1);
+  auto cc = _mm512_castpd_si512(a2);
+  auto dc = _mm512_castpd_si512(a3);
+
+  // Interleave 128-bit (2-double) chunks of a0/a1 and of a2/a3, e.g.
+  // t1 = {a0[0],a0[1],a1[0],a1[1],a0[4],a0[5],a1[4],a1[5]}.
+  // Masks: 0xCC selects lanes 2,3,6,7; 0x33 selects lanes 0,1,4,5.
+  auto t1 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(ac, 0xCC, bc, ac, 6));
+  auto t2 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(bc, 0x33, bc, ac, 2));
+  auto t3 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(cc, 0xCC, dc, cc, 6));
+  auto t4 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(dc, 0x33, dc, cc, 2));
+
+  // Swap 256-bit halves between the t-registers to finish the transpose.
+  a0 = _mm512_insertf64x4(t1, _mm512_extractf64x4_pd(t3, 0), 1);
+  a2 = _mm512_insertf64x4(t3, _mm512_extractf64x4_pd(t1, 1), 0);
+  a1 = _mm512_insertf64x4(t2, _mm512_extractf64x4_pd(t4, 0), 1);
+  a3 = _mm512_insertf64x4(t4, _mm512_extractf64x4_pd(t2, 1), 0);
 }
 
 void transpose_reg (Vec8d& a0, Vec8d& a1)