diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh
index f73c6a2f717b243e32411a5feea948c7777ce430..9ed1974055859fa175cdbe2c86d0bcabd80fb8bc 100644
--- a/dune/perftool/sumfact/transposereg.hh
+++ b/dune/perftool/sumfact/transposereg.hh
@@ -75,6 +75,61 @@ void transpose_reg (Vec8d& a0, Vec8d& a1)
   a1 = b1;
 }
 
+namespace impl
+{
+  /* Exchange the off-diagonal Vec4d halves of two registers:
+   *
+   *   (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp)
+   *
+   * Declared inline because it is defined in a header; without it,
+   * including this file from several translation units breaks the link.
+   */
+  inline void swap_halves(Vec8d& a, Vec8d& b)
+  {
+    /* Save the upper half of a before a is overwritten below. */
+    const Vec4d aupp = a.get_high();
+    a = Vec8d(a.get_low(), b.get_low());
+    b = Vec8d(aupp, b.get_high());
+  }
+
+  /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose
+   * on the lower and upper halves of the Vec8d. The four registers are
+   * modified in place.
+   */
+  inline void _transpose4x8(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
+  {
+    /* Step 1: interleave the even (b0, b2) and odd (b1, b3) lanes of
+     * each pair of rows. In blend8d, indices 0-7 pick from the first
+     * operand and 8-15 from the second (vector class library convention).
+     */
+    const Vec8d b0 = blend8d<0,8,2,10,4,12,6,14>(a0,a1);
+    const Vec8d b1 = blend8d<1,9,3,11,5,13,7,15>(a0,a1);
+    const Vec8d b2 = blend8d<0,8,2,10,4,12,6,14>(a2,a3);
+    const Vec8d b3 = blend8d<1,9,3,11,5,13,7,15>(a2,a3);
+    /* Step 2: merge lane pairs of the two interleaved row pairs. */
+    a0 = blend8d<0,1,8,9,4,5,12,13>(b0,b2);
+    a1 = blend8d<0,1,8,9,4,5,12,13>(b1,b3);
+    a2 = blend8d<2,3,10,11,6,7,14,15>(b0,b2);
+    a3 = blend8d<2,3,10,11,6,7,14,15>(b1,b3);
+  }
+}
+
+/* This is the 8x8 transpose of Vec8d's. It uses the same shuffling
+ * as Vec4d, but on the 4x4 subblocks. Afterwards, the off diagonal
+ * blocks are swapped.
+ *
+ * The eight registers are the matrix rows and are transposed in place.
+ */
+inline void transpose(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3,
+                      Vec8d& a4, Vec8d& a5, Vec8d& a6, Vec8d& a7)
+{
+  /* Transpose the 4x4 subblocks inside each group of four rows. */
+  impl::_transpose4x8(a0,a1,a2,a3);
+  impl::_transpose4x8(a4,a5,a6,a7);
+  /* Swap the off-diagonal blocks between row i and row i+4. */
+  impl::swap_halves(a0,a4);
+  impl::swap_halves(a1,a5);
+  impl::swap_halves(a2,a6);
+  impl::swap_halves(a3,a7);
+}
+
 #endif
 
 #endif