Skip to content
Snippets Groups Projects
Commit df2c1d60 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

[!271] Add AVX512 single precision tranposes and a transpose testing suite

Merge branch 'feature/skylake-single-precision-transposes' into 'master'

See merge request [dominic/dune-perftool!271]

  [dominic/dune-perftool!271]: Nonedominic/dune-perftool/merge_requests/271
parents f5c18a67 b13701f0
No related branches found
No related tags found
No related merge requests found
add_subdirectory(common)
add_subdirectory(test)
install(FILES vectorclass/dispatch_example.cpp
vectorclass/instrset_detect.cpp
......
......@@ -3,8 +3,51 @@
#include<dune/perftool/common/vectorclass.hh>
// DOUBLE 2 x 2
void transpose_reg(Vec2d& a0, Vec2d& a1)
{
double tmp = a0[1];
a0 = Vec2d(a0[0], a1[0]);
a1 = Vec2d(tmp, a1[1]);
}
// FLOAT 4 x 2
void transpose_reg(Vec4f& a0, Vec4f& a1)
{
Vec4f b0, b1;
b0 = blend4f<0,1,4,5>(a0,a1);
b1 = blend4f<2,3,6,7>(a0,a1);
a0 = b0;
a1 = b1;
}
// FLOAT 4 x 4
void transpose_reg(Vec4f& a0, Vec4f& a1, Vec4f& a2, Vec4f& a3)
{
Vec4f b0,b1,b2,b3;
b0 = blend4f<0,4,2,6>(a0,a1);
b1 = blend4f<1,5,3,7>(a0,a1);
b2 = blend4f<0,4,2,6>(a2,a3);
b3 = blend4f<1,5,3,7>(a2,a3);
a0 = blend4f<0,1,4,5>(b0,b2);
a1 = blend4f<0,1,4,5>(b1,b3);
a2 = blend4f<2,3,6,7>(b0,b2);
a3 = blend4f<2,3,6,7>(b1,b3);
}
#if MAX_VECTOR_SIZE >= 256
// DOUBLE 4 x 2
void transpose_reg(Vec4d& a0, Vec4d& a1)
{
Vec4d b0, b1;
b0 = blend4d<0,1,4,5>(a0,a1);
b1 = blend4d<2,3,6,7>(a0,a1);
a0 = b0;
a1 = b1;
}
// DOUBLE 4 x 4
void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3)
{
Vec4d b0,b1,b2,b3;
......@@ -18,15 +61,17 @@ void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3)
a3 = blend4d<2,3,6,7>(b1,b3);
}
void transpose_reg(Vec4d& a0, Vec4d& a1)
// FLOAT 8 x 2
void transpose_reg (Vec8f& a0, Vec8f& a1)
{
Vec4d b0, b1;
b0 = blend4d<0,1,4,5>(a0,a1);
b1 = blend4d<2,3,6,7>(a0,a1);
Vec8f b0, b1;
b0 = blend8f<0,1,2,3,8,9,10,11>(a0, a1);
b1 = blend8f<4,5,6,7,12,13,14,15>(a0, a1);
a0 = b0;
a1 = b1;
}
// FLOAT 8 x 4
void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
{
Vec8f b0, b1, b2, b3;
......@@ -40,19 +85,50 @@ void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
a3 = blend8f<4,5,6,7,12,13,14,15>(b1, b3);
}
void transpose_reg (Vec8f& a0, Vec8f& a1)
namespace impl
{
Vec8f b0, b1;
b0 = blend8f<0,1,2,3,8,9,10,11>(a0, a1);
b1 = blend8f<4,5,6,7,12,13,14,15>(a0, a1);
a0 = b0;
a1 = b1;
/* (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp) */
void swap_halves(Vec8f& a, Vec8f& b)
{
Vec4f tmp = a.get_high();
a = Vec8f(a.get_low(), b.get_low());
b = Vec8f(tmp, b.get_high());
}
/* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose
* on the lower and upper halves of the Vec8d
*/
void _transpose4x8(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
{
Vec8f b0,b1,b2,b3;
b0 = blend8f<0,8,2,10,4,12,6,14>(a0,a1);
b1 = blend8f<1,9,3,11,5,13,7,15>(a0,a1);
b2 = blend8f<0,8,2,10,4,12,6,14>(a2,a3);
b3 = blend8f<1,9,3,11,5,13,7,15>(a2,a3);
a0 = blend8f<0,1,8,9,4,5,12,13>(b0,b2);
a1 = blend8f<0,1,8,9,4,5,12,13>(b1,b3);
a2 = blend8f<2,3,10,11,6,7,14,15>(b0,b2);
a3 = blend8f<2,3,10,11,6,7,14,15>(b1,b3);
}
}
// FLOAT 8 x 8
void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3,
Vec8f& a4, Vec8f& a5, Vec8f& a6, Vec8f& a7)
{
impl::_transpose4x8(a0,a1,a2,a3);
impl::_transpose4x8(a4,a5,a6,a7);
impl::swap_halves(a0,a4);
impl::swap_halves(a1,a5);
impl::swap_halves(a2,a6);
impl::swap_halves(a3,a7);
}
#endif
#if MAX_VECTOR_SIZE >= 512
// DOUBLE 8 x 4
void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
{
Vec8d b0, b1, b2, b3;
......@@ -66,6 +142,7 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
}
// DOUBLE 8 x 2
/** TODO: Is this transpose using blend8d superior to the swap_halves
* version below using get_low/get_high?
*/
......@@ -105,6 +182,7 @@ namespace impl
}
}
// DOUBLE 8 x 8
/* This is the 8x8 transpose of Vec8d's. It uses the same shuffling
* as Vec4d, but on the 4x4 subblocks. Afterwards, the off diagonal
* blocks are swapped.
......@@ -120,6 +198,66 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3,
impl::swap_halves(a3,a7);
}
void swap_halves(Vec16f& a0, Vec16f& a1)
{
Vec8f tmp = a0.get_high();
a0 = Vec16f(a0.get_low(), a1.get_low());
a1 = Vec16f(tmp, a1.get_high());
}
// FLOAT 16 x 2
void transpose_reg(Vec16f& a0, Vec16f& a1)
{
swap_halves(a0, a1);
}
// FLOAT 16 x 4
void transpose_reg(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3)
{
Vec16f b0,b1,b2,b3;
b0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a0,a1);
b1 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a0,a1);
b2 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a2,a3);
b3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a2,a3);
swap_halves(b0,b2);
swap_halves(b1,b3);
a0 = b0;
a1 = b1;
a2 = b2;
a3 = b3;
}
namespace impl
{
/* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose
* on the lower and upper halves of the Vec8d
*/
void _transpose4x16(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3)
{
Vec16f b0,b1,b2,b3;
b0 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a0,a1);
b1 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a0,a1);
b2 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a2,a3);
b3 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a2,a3);
a0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b0,b2);
a1 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b1,b3);
a2 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b0,b2);
a3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b1,b3);
}
}
// FLOAT 16 x 8
void transpose_reg(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3,
Vec16f& a4, Vec16f& a5, Vec16f& a6, Vec16f& a7)
{
impl::_transpose4x16(a0,a1,a2,a3);
impl::_transpose4x16(a4,a5,a6,a7);
swap_halves(a0,a4);
swap_halves(a1,a5);
swap_halves(a2,a6);
swap_halves(a3,a7);
}
#endif
#endif
# There is a clang bug for a 16x8 transpose and I did not manage to include
# only that one due to a weirdness in CI - it worked on my machine through testtools.
if(NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang"))
dune_add_system_test(SOURCE test_transpose.cc
BASENAME test_transpose
INIFILE transpose.mini
)
endif()
#include "config.h"
#define MAX_VECTOR_SIZE 512
#include<dune/perftool/common/vectorclass.hh>
#include<dune/perftool/sumfact/transposereg.hh>
#include<array>
#include<iostream>
#include<utility>
template<std::size_t ...S>
void apply_function(std::array<VECTYPE, M>& data, std::index_sequence<S...>)
{
transpose_reg(std::get<S>(data) ...);
}
void print(const std::array<VECTYPE, M>& data)
{
for(int i=0; i<M; ++i)
{
for(int j=0; j<N; ++j)
std::cout << data[i].extract(j) << " ";
std::cout << std::endl;
}
}
int main()
{
// Setup data
std::array<VECTYPE, M> testdata;
int data = 0;
for(int i=0; i<M; ++i)
for(int j=0; j<N; ++j, ++data)
testdata[i].insert(j, data);
std::array<VECTYPE, M> originaldata(testdata);
// Print the test data once
std::cout << "Before transposition:" << std::endl;
print(testdata);
// Apply the transpose function
apply_function(testdata, std::make_index_sequence<M>{});
// And print the test data once more
std::cout << "After transposition:" << std::endl;
print(testdata);
int result = 0;
for(int i=0; i<M; ++i)
for(int j=0; j<N; ++j)
{
// Dirty handcoding of the generic permutation
auto NM = (int)N / (int)M;
auto linear = i * N + j;
auto block = linear / NM;
auto newblock = (block % M) * M + (block / M);
auto newlinear = newblock * NM + (linear % NM);
auto newj = newlinear % N;
auto newi = newlinear / N;
if(testdata[newi].extract(newj) != originaldata[i].extract(j))
{
std::cout << "Test failure in transpose: index (" << i << "," << j << ") in original data does not match index (" << newi << "," << newj << ") in new data." << std::endl;
++result;
}
}
return result;
}
__name = test_transpose_{__exec_suffix}
__exec_suffix = {__static.BASETYPE}_{__static.N}x{__static.M}
# These transposes do not make sense
{__static.M} > {__static.N} | exclude
{__static.N} == 2 and {shorttype} == f | exclude
{__static.N} == 16 and {shorttype} == d | exclude
# These transposes are not yet implemented
{__static.M} == 16 | exclude
shorttype = f, d | expand base
[__static]
N = 2, 4, 8, 16 | expand n
M = 2, 4, 8, 16 | expand m
BASETYPE = float, double | expand base
VECTYPE = Vec{__static.N}{shorttype}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment