diff --git a/dune/perftool/CMakeLists.txt b/dune/perftool/CMakeLists.txt
index 7533fd57e0f5cafac76f7591c9bbbf41ea7b819b..92f812d0fe1d364e4fdd175b6cebe02b14f2f978 100644
--- a/dune/perftool/CMakeLists.txt
+++ b/dune/perftool/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(common)
+add_subdirectory(test)
 
 install(FILES vectorclass/dispatch_example.cpp
               vectorclass/instrset_detect.cpp
diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index d2ce09c39bf6ff5b87cc236cbf5d2a5bfcf850f1..a3b8372288d9461c85f6761328fa96e7db465ea4 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -3,8 +3,51 @@
 
 #include<dune/perftool/common/vectorclass.hh>
 
+// DOUBLE 2 x 2
+void transpose_reg(Vec2d& a0, Vec2d& a1)
+{
+  double tmp = a0[1];
+  a0 = Vec2d(a0[0], a1[0]);
+  a1 = Vec2d(tmp, a1[1]);
+}
+
+// FLOAT 4 x 2
+void transpose_reg(Vec4f& a0, Vec4f& a1)
+{
+  Vec4f b0, b1;
+  b0 = blend4f<0,1,4,5>(a0,a1);
+  b1 = blend4f<2,3,6,7>(a0,a1);
+  a0 = b0;
+  a1 = b1;
+}
+
+// FLOAT 4 x 4
+void transpose_reg(Vec4f& a0, Vec4f& a1, Vec4f& a2, Vec4f& a3)
+{
+  Vec4f b0,b1,b2,b3;
+  b0 = blend4f<0,4,2,6>(a0,a1);
+  b1 = blend4f<1,5,3,7>(a0,a1);
+  b2 = blend4f<0,4,2,6>(a2,a3);
+  b3 = blend4f<1,5,3,7>(a2,a3);
+  a0 = blend4f<0,1,4,5>(b0,b2);
+  a1 = blend4f<0,1,4,5>(b1,b3);
+  a2 = blend4f<2,3,6,7>(b0,b2);
+  a3 = blend4f<2,3,6,7>(b1,b3);
+}
+
 #if MAX_VECTOR_SIZE >= 256
 
+// DOUBLE 4 x 2
+void transpose_reg(Vec4d& a0, Vec4d& a1)
+{
+  Vec4d b0, b1;
+  b0 = blend4d<0,1,4,5>(a0,a1);
+  b1 = blend4d<2,3,6,7>(a0,a1);
+  a0 = b0;
+  a1 = b1;
+}
+
+// DOUBLE 4 x 4
 void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3)
 {
   Vec4d b0,b1,b2,b3;
@@ -18,15 +61,17 @@ void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3)
   a3 = blend4d<2,3,6,7>(b1,b3);
 }
 
-void transpose_reg(Vec4d& a0, Vec4d& a1)
+// FLOAT 8 x 2
+void transpose_reg (Vec8f& a0, Vec8f& a1)
 {
-  Vec4d b0, b1;
-  b0 = blend4d<0,1,4,5>(a0,a1);
-  b1 = blend4d<2,3,6,7>(a0,a1);
+  Vec8f b0, b1;
+  b0 = blend8f<0,1,2,3,8,9,10,11>(a0, a1);
+  b1 = blend8f<4,5,6,7,12,13,14,15>(a0, a1);
   a0 = b0;
   a1 = b1;
 }
 
+// FLOAT 8 x 4
 void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
 {
   Vec8f b0, b1, b2, b3;
@@ -40,19 +85,50 @@ void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
   a3 = blend8f<4,5,6,7,12,13,14,15>(b1, b3);
 }
 
-void transpose_reg (Vec8f& a0, Vec8f& a1)
+namespace impl
 {
-  Vec8f b0, b1;
-  b0 = blend8f<0,1,2,3,8,9,10,11>(a0, a1);
-  b1 = blend8f<4,5,6,7,12,13,14,15>(a0, a1);
-  a0 = b0;
-  a1 = b1;
+  /* (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp) */
+  void swap_halves(Vec8f& a, Vec8f& b)
+  {
+    Vec4f tmp = a.get_high();
+    a = Vec8f(a.get_low(), b.get_low());
+    b = Vec8f(tmp, b.get_high());
+  }
+
+  /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose
+   * on the lower and upper halves of the Vec8d
+   */
+  void _transpose4x8(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
+  {
+    Vec8f b0,b1,b2,b3;
+    b0 = blend8f<0,8,2,10,4,12,6,14>(a0,a1);
+    b1 = blend8f<1,9,3,11,5,13,7,15>(a0,a1);
+    b2 = blend8f<0,8,2,10,4,12,6,14>(a2,a3);
+    b3 = blend8f<1,9,3,11,5,13,7,15>(a2,a3);
+    a0 = blend8f<0,1,8,9,4,5,12,13>(b0,b2);
+    a1 = blend8f<0,1,8,9,4,5,12,13>(b1,b3);
+    a2 = blend8f<2,3,10,11,6,7,14,15>(b0,b2);
+    a3 = blend8f<2,3,10,11,6,7,14,15>(b1,b3);
+  }
+}
+
+// FLOAT 8 x 8
+void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3,
+                   Vec8f& a4, Vec8f& a5, Vec8f& a6, Vec8f& a7)
+{
+  impl::_transpose4x8(a0,a1,a2,a3);
+  impl::_transpose4x8(a4,a5,a6,a7);
+  impl::swap_halves(a0,a4);
+  impl::swap_halves(a1,a5);
+  impl::swap_halves(a2,a6);
+  impl::swap_halves(a3,a7);
 }
 
 #endif
 
 #if MAX_VECTOR_SIZE >= 512
 
+// DOUBLE 8 x 4
 void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
 {
   Vec8d b0, b1, b2, b3;
@@ -66,6 +142,7 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
   a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
 }
 
+// DOUBLE 8 x 2
 /** TODO: Is this transpose using blend8d superior to the swap_halves
  *        version below using get_low/get_high?
  */
@@ -105,6 +182,7 @@ namespace impl
   }
 }
 
+// DOUBLE 8 x 8
 /* This is the 8x8 transpose of Vec8d's. It uses the same shuffling
  * as Vec4d, but on the 4x4 subblocks. Afterwards, the off diagonal
  * blocks are swapped.
@@ -120,6 +198,66 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3,
   impl::swap_halves(a3,a7);
 }
 
+void swap_halves(Vec16f& a0, Vec16f& a1)
+{
+  Vec8f tmp = a0.get_high();
+  a0 = Vec16f(a0.get_low(), a1.get_low());
+  a1 = Vec16f(tmp, a1.get_high());
+}
+
+// FLOAT 16 x 2
+void transpose_reg(Vec16f& a0, Vec16f& a1)
+{
+  swap_halves(a0, a1);
+}
+
+// FLOAT 16 x 4
+void transpose_reg(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3)
+{
+  Vec16f b0,b1,b2,b3;
+  b0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a0,a1);
+  b1 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a0,a1);
+  b2 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a2,a3);
+  b3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a2,a3);
+  swap_halves(b0,b2);
+  swap_halves(b1,b3);
+  a0 = b0;
+  a1 = b1;
+  a2 = b2;
+  a3 = b3;
+}
+
+namespace impl
+{
+  /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose
+   * on the lower and upper halves of the Vec8d
+   */
+  void _transpose4x16(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3)
+  {
+    Vec16f b0,b1,b2,b3;
+    b0 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a0,a1);
+    b1 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a0,a1);
+    b2 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a2,a3);
+    b3 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a2,a3);
+    a0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b0,b2);
+    a1 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b1,b3);
+    a2 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b0,b2);
+    a3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b1,b3);
+  }
+}
+
+// FLOAT 16 x 8
+void transpose_reg(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3,
+		   Vec16f& a4, Vec16f& a5, Vec16f& a6, Vec16f& a7)
+{
+  impl::_transpose4x16(a0,a1,a2,a3);
+  impl::_transpose4x16(a4,a5,a6,a7);
+  swap_halves(a0,a4);
+  swap_halves(a1,a5);
+  swap_halves(a2,a6);
+  swap_halves(a3,a7);
+}
+
 #endif
 
 #endif
diff --git a/dune/perftool/test/CMakeLists.txt b/dune/perftool/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..69aeb19014fc881242af94b44d75f81f0315da17
--- /dev/null
+++ b/dune/perftool/test/CMakeLists.txt
@@ -0,0 +1,8 @@
+# There is a clang bug for a 16x8 transpose and I did not manage to include
+# only that one due to a weirdness in CI - it worked on my machine through testtools.
+if(NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
+  dune_add_system_test(SOURCE test_transpose.cc
+                       BASENAME test_transpose
+                       INIFILE transpose.mini
+                       )
+endif()
diff --git a/dune/perftool/test/test_transpose.cc b/dune/perftool/test/test_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d265b1bd266969ce900957dff547a20d5a04607
--- /dev/null
+++ b/dune/perftool/test/test_transpose.cc
@@ -0,0 +1,74 @@
+#include "config.h"
+
+#define MAX_VECTOR_SIZE 512
+
+#include<dune/perftool/common/vectorclass.hh>
+#include<dune/perftool/sumfact/transposereg.hh>
+
+#include<array>
+#include<iostream>
+#include<utility>
+
+
+template<std::size_t ...S>
+void apply_function(std::array<VECTYPE, M>& data, std::index_sequence<S...>)
+{
+  transpose_reg(std::get<S>(data) ...);
+}
+
+
+void print(const std::array<VECTYPE, M>& data)
+{
+  for(int i=0; i<M; ++i)
+  {
+    for(int j=0; j<N; ++j)
+      std::cout << data[i].extract(j) << " ";
+    std::cout << std::endl;
+  }
+}
+
+
+int main()
+{
+  // Setup data
+  std::array<VECTYPE, M> testdata;
+  int data = 0;
+  for(int i=0; i<M; ++i)
+    for(int j=0; j<N; ++j, ++data)
+      testdata[i].insert(j, data);
+
+  std::array<VECTYPE, M> originaldata(testdata);
+
+  // Print the test data once
+  std::cout << "Before transposition:" << std::endl;
+  print(testdata);
+
+  // Apply the transpose function
+  apply_function(testdata, std::make_index_sequence<M>{});
+
+  // And print the test data once more
+  std::cout << "After transposition:" << std::endl;
+  print(testdata);
+
+  int result = 0;
+  for(int i=0; i<M; ++i)
+    for(int j=0; j<N; ++j)
+    {
+      // Dirty handcoding of the generic permutation
+      auto NM = (int)N / (int)M;
+      auto linear = i * N + j;
+      auto block = linear / NM;
+      auto newblock = (block % M) * M + (block / M);
+      auto newlinear = newblock * NM + (linear % NM);
+      auto newj = newlinear % N;
+      auto newi = newlinear / N;
+
+      if(testdata[newi].extract(newj) != originaldata[i].extract(j))
+      {
+	std::cout << "Test failure in transpose: index (" << i << "," << j << ") in original data does not match index (" << newi << "," << newj << ") in new data." << std::endl;
+	++result;
+      }
+    }
+
+  return result;
+}
diff --git a/dune/perftool/test/transpose.mini b/dune/perftool/test/transpose.mini
new file mode 100644
index 0000000000000000000000000000000000000000..53a31e14fe0714271ade56205f05316e82896613
--- /dev/null
+++ b/dune/perftool/test/transpose.mini
@@ -0,0 +1,18 @@
+__name = test_transpose_{__exec_suffix}
+__exec_suffix = {__static.BASETYPE}_{__static.N}x{__static.M} 
+
+# These transposes do not make sense
+{__static.M} > {__static.N} | exclude
+{__static.N} == 2 and {shorttype} == f | exclude
+{__static.N} == 16 and {shorttype} == d | exclude
+
+# These transposes are not yet implemented
+{__static.M} == 16 | exclude
+
+shorttype = f, d | expand base
+
+[__static]
+N = 2, 4, 8, 16 | expand n
+M = 2, 4, 8, 16 | expand m
+BASETYPE = float, double | expand base
+VECTYPE = Vec{__static.N}{shorttype}