diff --git a/dune/perftool/CMakeLists.txt b/dune/perftool/CMakeLists.txt index 7533fd57e0f5cafac76f7591c9bbbf41ea7b819b..92f812d0fe1d364e4fdd175b6cebe02b14f2f978 100644 --- a/dune/perftool/CMakeLists.txt +++ b/dune/perftool/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(common) +add_subdirectory(test) install(FILES vectorclass/dispatch_example.cpp vectorclass/instrset_detect.cpp diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh index d2ce09c39bf6ff5b87cc236cbf5d2a5bfcf850f1..a3b8372288d9461c85f6761328fa96e7db465ea4 100644 --- a/dune/perftool/sumfact/transposereg.hh +++ b/dune/perftool/sumfact/transposereg.hh @@ -3,8 +3,51 @@ #include<dune/perftool/common/vectorclass.hh> +// DOUBLE 2 x 2 +void transpose_reg(Vec2d& a0, Vec2d& a1) +{ + double tmp = a0[1]; + a0 = Vec2d(a0[0], a1[0]); + a1 = Vec2d(tmp, a1[1]); +} + +// FLOAT 4 x 2 +void transpose_reg(Vec4f& a0, Vec4f& a1) +{ + Vec4f b0, b1; + b0 = blend4f<0,1,4,5>(a0,a1); + b1 = blend4f<2,3,6,7>(a0,a1); + a0 = b0; + a1 = b1; +} + +// FLOAT 4 x 4 +void transpose_reg(Vec4f& a0, Vec4f& a1, Vec4f& a2, Vec4f& a3) +{ + Vec4f b0,b1,b2,b3; + b0 = blend4f<0,4,2,6>(a0,a1); + b1 = blend4f<1,5,3,7>(a0,a1); + b2 = blend4f<0,4,2,6>(a2,a3); + b3 = blend4f<1,5,3,7>(a2,a3); + a0 = blend4f<0,1,4,5>(b0,b2); + a1 = blend4f<0,1,4,5>(b1,b3); + a2 = blend4f<2,3,6,7>(b0,b2); + a3 = blend4f<2,3,6,7>(b1,b3); +} + #if MAX_VECTOR_SIZE >= 256 +// DOUBLE 4 x 2 +void transpose_reg(Vec4d& a0, Vec4d& a1) +{ + Vec4d b0, b1; + b0 = blend4d<0,1,4,5>(a0,a1); + b1 = blend4d<2,3,6,7>(a0,a1); + a0 = b0; + a1 = b1; +} + +// DOUBLE 4 x 4 void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3) { Vec4d b0,b1,b2,b3; @@ -18,15 +61,17 @@ void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3) a3 = blend4d<2,3,6,7>(b1,b3); } -void transpose_reg(Vec4d& a0, Vec4d& a1) +// FLOAT 8 x 2 +void transpose_reg (Vec8f& a0, Vec8f& a1) { - Vec4d b0, b1; - b0 = blend4d<0,1,4,5>(a0,a1); - b1 = blend4d<2,3,6,7>(a0,a1); + Vec8f b0, b1; + b0 = blend8f<0,1,2,3,8,9,10,11>(a0, a1); + b1 = blend8f<4,5,6,7,12,13,14,15>(a0, a1); a0 = b0; a1 = b1; } +// FLOAT 8 x 4 void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3) { Vec8f b0, b1, b2, b3; @@ -40,19 +85,50 @@ void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3) a3 = blend8f<4,5,6,7,12,13,14,15>(b1, b3); } -void transpose_reg (Vec8f& a0, Vec8f& a1) +namespace impl { - Vec8f b0, b1; - b0 = blend8f<0,1,2,3,8,9,10,11>(a0, a1); - b1 = blend8f<4,5,6,7,12,13,14,15>(a0, a1); - a0 = b0; - a1 = b1; + /* (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp) */ + void swap_halves(Vec8f& a, Vec8f& b) + { + Vec4f tmp = a.get_high(); + a = Vec8f(a.get_low(), b.get_low()); + b = Vec8f(tmp, b.get_high()); + } + + /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose + * on the lower and upper halves of the Vec8d + */ + void _transpose4x8(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3) + { + Vec8f b0,b1,b2,b3; + b0 = blend8f<0,8,2,10,4,12,6,14>(a0,a1); + b1 = blend8f<1,9,3,11,5,13,7,15>(a0,a1); + b2 = blend8f<0,8,2,10,4,12,6,14>(a2,a3); + b3 = blend8f<1,9,3,11,5,13,7,15>(a2,a3); + a0 = blend8f<0,1,8,9,4,5,12,13>(b0,b2); + a1 = blend8f<0,1,8,9,4,5,12,13>(b1,b3); + a2 = blend8f<2,3,10,11,6,7,14,15>(b0,b2); + a3 = blend8f<2,3,10,11,6,7,14,15>(b1,b3); + } +} + +// FLOAT 8 x 8 +void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3, + Vec8f& a4, Vec8f& a5, Vec8f& a6, Vec8f& a7) +{ + impl::_transpose4x8(a0,a1,a2,a3); + impl::_transpose4x8(a4,a5,a6,a7); + impl::swap_halves(a0,a4); + impl::swap_halves(a1,a5); + impl::swap_halves(a2,a6); + impl::swap_halves(a3,a7); } #endif #if MAX_VECTOR_SIZE >= 512 +// DOUBLE 8 x 4 void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3) { Vec8d b0, b1, b2, b3; @@ -66,6 +142,7 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3) a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3); } +// DOUBLE 8 x 2 /** TODO: Is this transpose using blend8d superior to the swap_halves * version below using get_low/get_high? */ @@ -105,6 +182,7 @@ namespace impl } } +// DOUBLE 8 x 8 /* This is the 8x8 transpose of Vec8d's. It uses the same shuffling * as Vec4d, but on the 4x4 subblocks. Afterwards, the off diagonal * blocks are swapped. @@ -120,6 +198,66 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3, impl::swap_halves(a3,a7); } +void swap_halves(Vec16f& a0, Vec16f& a1) +{ + Vec8f tmp = a0.get_high(); + a0 = Vec16f(a0.get_low(), a1.get_low()); + a1 = Vec16f(tmp, a1.get_high()); +} + +// FLOAT 16 x 2 +void transpose_reg(Vec16f& a0, Vec16f& a1) +{ + swap_halves(a0, a1); +} + +// FLOAT 16 x 4 +void transpose_reg(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3) +{ + Vec16f b0,b1,b2,b3; + b0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a0,a1); + b1 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a0,a1); + b2 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a2,a3); + b3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a2,a3); + swap_halves(b0,b2); + swap_halves(b1,b3); + a0 = b0; + a1 = b1; + a2 = b2; + a3 = b3; +} + +namespace impl +{ + /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose + * on the lower and upper halves of the Vec8d + */ + void _transpose4x16(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3) + { + Vec16f b0,b1,b2,b3; + b0 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a0,a1); + b1 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a0,a1); + b2 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a2,a3); + b3 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a2,a3); + a0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b0,b2); + a1 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b1,b3); + a2 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b0,b2); + a3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b1,b3); + } +} + +// FLOAT 16 x 8 +void transpose_reg(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3, + Vec16f& a4, Vec16f& a5, Vec16f& a6, Vec16f& a7) +{ + impl::_transpose4x16(a0,a1,a2,a3); + impl::_transpose4x16(a4,a5,a6,a7); + swap_halves(a0,a4); + swap_halves(a1,a5); + swap_halves(a2,a6); + swap_halves(a3,a7); +} + #endif #endif diff --git a/dune/perftool/test/CMakeLists.txt b/dune/perftool/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..69aeb19014fc881242af94b44d75f81f0315da17 --- /dev/null +++ b/dune/perftool/test/CMakeLists.txt @@ -0,0 +1,8 @@ +# There is a clang bug for a 16x8 transpose and I did not manage to include +# only that one due to a weirdness in CI - it worked on my machine through testtools. +if(NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")) + dune_add_system_test(SOURCE test_transpose.cc + BASENAME test_transpose + INIFILE transpose.mini + ) +endif() diff --git a/dune/perftool/test/test_transpose.cc b/dune/perftool/test/test_transpose.cc new file mode 100644 index 0000000000000000000000000000000000000000..8d265b1bd266969ce900957dff547a20d5a04607 --- /dev/null +++ b/dune/perftool/test/test_transpose.cc @@ -0,0 +1,74 @@ +#include "config.h" + +#define MAX_VECTOR_SIZE 512 + +#include<dune/perftool/common/vectorclass.hh> +#include<dune/perftool/sumfact/transposereg.hh> + +#include<array> +#include<iostream> +#include<utility> + + +template<std::size_t ...S> +void apply_function(std::array<VECTYPE, M>& data, std::index_sequence<S...>) +{ + transpose_reg(std::get<S>(data) ...); +} + + +void print(const std::array<VECTYPE, M>& data) +{ + for(int i=0; i<M; ++i) + { + for(int j=0; j<N; ++j) + std::cout << data[i].extract(j) << " "; + std::cout << std::endl; + } +} + + +int main() +{ + // Setup data + std::array<VECTYPE, M> testdata; + int data = 0; + for(int i=0; i<M; ++i) + for(int j=0; j<N; ++j, ++data) + testdata[i].insert(j, data); + + std::array<VECTYPE, M> originaldata(testdata); + + // Print the test data once + std::cout << "Before transposition:" << std::endl; + print(testdata); + + // Apply the transpose function + apply_function(testdata, std::make_index_sequence<M>{}); + + // And print the test data once more + std::cout << "After transposition:" << std::endl; + print(testdata); + + int result = 0; + for(int i=0; i<M; ++i) + for(int j=0; j<N; ++j) + { + // Dirty handcoding of the generic permutation + auto NM = (int)N / (int)M; + auto linear = i * N + j; + auto block = linear / NM; + auto newblock = (block % M) * M + (block / M); + auto newlinear = newblock * NM + (linear % NM); + auto newj = newlinear % N; + auto newi = newlinear / N; + + if(testdata[newi].extract(newj) != originaldata[i].extract(j)) + { + std::cout << "Test failure in transpose: index (" << i << "," << j << ") in original data does not match index (" << newi << "," << newj << ") in new data." << std::endl; + ++result; + } + } + + return result; +} diff --git a/dune/perftool/test/transpose.mini b/dune/perftool/test/transpose.mini new file mode 100644 index 0000000000000000000000000000000000000000..53a31e14fe0714271ade56205f05316e82896613 --- /dev/null +++ b/dune/perftool/test/transpose.mini @@ -0,0 +1,18 @@ +__name = test_transpose_{__exec_suffix} +__exec_suffix = {__static.BASETYPE}_{__static.N}x{__static.M} + +# These transposes do not make sense +{__static.M} > {__static.N} | exclude +{__static.N} == 2 and {shorttype} == f | exclude +{__static.N} == 16 and {shorttype} == d | exclude + +# These transposes are not yet implemented +{__static.M} == 16 | exclude + +shorttype = f, d | expand base + +[__static] +N = 2, 4, 8, 16 | expand n +M = 2, 4, 8, 16 | expand m +BASETYPE = float, double | expand base +VECTYPE = Vec{__static.N}{shorttype}