diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt index 5a75b84e1c0c3b4d4daa19381d7032489d2cbddb..f472b3d2b80421c4f18b1283a63f80a065a97435 100644 --- a/applications/CMakeLists.txt +++ b/applications/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(poisson_dg) add_subdirectory(poisson_dg_tensor) +add_subdirectory(knl) diff --git a/applications/knl/CMakeLists.txt b/applications/knl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a75b84e1c0c3b4d4daa19381d7032489d2cbddb --- /dev/null +++ b/applications/knl/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(poisson_dg) +add_subdirectory(poisson_dg_tensor) diff --git a/applications/knl/poisson_dg/CMakeLists.txt b/applications/knl/poisson_dg/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a906a02d5765bd323ab634bb810010bfa92e1535 --- /dev/null +++ b/applications/knl/poisson_dg/CMakeLists.txt @@ -0,0 +1,11 @@ +dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl + BASENAME app_knl_poisson + INIFILE knl_poisson_dg.mini + NO_TESTS + ) + +dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl + BASENAME verify_app_knl_poisson_dg + INIFILE verify.mini + NO_TESTS + ) diff --git a/applications/knl/poisson_dg/knl_poisson_dg.mini b/applications/knl/poisson_dg/knl_poisson_dg.mini new file mode 100644 index 0000000000000000000000000000000000000000..9951705d981a39a8e9c20c089fd117eb76b4e2ee --- /dev/null +++ b/applications/knl/poisson_dg/knl_poisson_dg.mini @@ -0,0 +1,50 @@ +__name = app_knl_poisson_{__exec_suffix} +__exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{formcompiler.instrumentation_level} + +opcount_suffix = opcount, nonopcount | expand opcount +{opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude + +# Calculate the size of the grid to equlibritate it to 100 MB/rank +# Input parameters +dim = 3 +mbperrank = 100 +ranks = 66 +floatingbytes = 8 + +# Metaini Calculations +memperrank = {mbperrank} * 1048576 | eval +dofsperdir = {formcompiler.ufl_variants.degree} + 1 | eval +celldofs = {dofsperdir} ** {dim} | eval +cellsperrank = {memperrank} / ({floatingbytes} * {celldofs}) | eval +cellsperdir = {cellsperrank} ** (1/{dim}) | eval | toint +firstdircells = {ranks} * {cellsperdir} | eval +dimminusone = {dim} - 1 | eval +ones = 1 | repeat {dimminusone} +otherdircells = {cellsperdir} | repeat {dimminusone} + +# Set up the timing identifier +identifier = knl_poisson_dg_deg{formcompiler.ufl_variants.degree} + +# Setup the grid! +extension = 1.0 | repeat {dim} +cells = {firstdircells} {otherdircells} +partitioning = {ranks} {ones} + +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +fastdg = 1 +sumfact = 1 +vectorize_quad = 1 +vectorize_diagonal = 1 +instrumentation_level = 2, 3, 4 | expand +opcounter = 1, 0 | expand opcount +time_opcounter = 0, 1 | expand opcount +quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval +architecture = knl + +[formcompiler.ufl_variants] +cell = hexahedron +degree = 1, 3, 5, 7, 9 | expand diff --git a/applications/knl/poisson_dg/poisson_dg.ufl b/applications/knl/poisson_dg/poisson_dg.ufl new file mode 100644 index 0000000000000000000000000000000000000000..2c0f2c34436f1c6b76817d6abad344fae3183d62 --- /dev/null +++ b/applications/knl/poisson_dg/poisson_dg.ufl @@ -0,0 +1,32 @@ +dim = 3 +x = SpatialCoordinate(cell) +f = -6. +g = x[0]*x[0] + x[1]*x[1] + x[2]*x[2] + +V = FiniteElement("DG", cell, degree) + +u = TrialFunction(V) +v = TestFunction(V) + +n = FacetNormal(cell)('+') + +alpha = 1.0 +h_ext = CellVolume(cell) / FacetArea(cell) +cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext +h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell) +cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int + +theta = -1.0 + +r = inner(grad(u), grad(v))*dx \ + + inner(n, avg(grad(u)))*jump(v)*dS \ + + cse_gamma_int*jump(u)*jump(v)*dS \ + - theta*jump(u)*inner(avg(grad(v)), n)*dS \ + - inner(n, grad(u))*v*ds \ + + cse_gamma_ext*u*v*ds \ + + theta*u*inner(grad(v), n)*ds \ + - f*v*dx \ + - theta*g*inner(grad(v), n)*ds \ + - cse_gamma_ext*g*v*ds + +forms = [r] diff --git a/applications/knl/poisson_dg/verify.mini b/applications/knl/poisson_dg/verify.mini new file mode 100644 index 0000000000000000000000000000000000000000..bae219fc0b2336b1109b109b8c236e9e8ef9cc5c --- /dev/null +++ b/applications/knl/poisson_dg/verify.mini @@ -0,0 +1,21 @@ +__name = verify_app_knl_poisson_dg + +# Setup the grid! +extension = 1.0 1.0 1.0 +cells = 8 8 8 + +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +fastdg = 1 +sumfact = 1 +vectorize_quad = 1 +vectorize_diagonal = 1 +quadrature_order = 6 +architecture = knl + +[formcompiler.ufl_variants] +cell = hexahedron +degree = 3 diff --git a/applications/knl/poisson_dg_tensor/CMakeLists.txt b/applications/knl/poisson_dg_tensor/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..86889e3f0b4893c6afcf0a4e21ed954f75a626c7 --- /dev/null +++ b/applications/knl/poisson_dg_tensor/CMakeLists.txt @@ -0,0 +1,11 @@ +dune_add_formcompiler_system_test(UFLFILE poisson_dg_tensor.ufl + BASENAME app_knl_poisson_tensor + INIFILE knl_poisson_dg_tensor.mini + NO_TESTS + ) + +dune_add_formcompiler_system_test(UFLFILE poisson_dg_tensor.ufl + BASENAME verify_app_knl_poisson_dg_tensor + INIFILE verify.mini + NO_TESTS + ) diff --git a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini new file mode 100644 index 0000000000000000000000000000000000000000..07aa0a438765cc5ddddf2f46dad71ae7be37d835 --- /dev/null +++ b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini @@ -0,0 +1,50 @@ +__name = app_knl_poisson_tensor_{__exec_suffix} +__exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{formcompiler.instrumentation_level} + +opcount_suffix = opcount, nonopcount | expand opcount +{opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude + +# Calculate the size of the grid to equlibritate it to 100 MB/rank +# Input parameters +dim = 3 +mbperrank = 100 +ranks = 66 +floatingbytes = 8 + +# Metaini Calculations +memperrank = {mbperrank} * 1048576 | eval +dofsperdir = {formcompiler.ufl_variants.degree} + 1 | eval +celldofs = {dofsperdir} ** {dim} | eval +cellsperrank = {memperrank} / ({floatingbytes} * {celldofs}) | eval +cellsperdir = {cellsperrank} ** (1/{dim}) | eval | toint +firstdircells = {ranks} * {cellsperdir} | eval +dimminusone = {dim} - 1 | eval +ones = 1 | repeat {dimminusone} +otherdircells = {cellsperdir} | repeat {dimminusone} + +# Set up the timing identifier +identifier = knl_poisson_dg_deg{formcompiler.ufl_variants.degree} + +# Setup the grid! +extension = 1.0 | repeat {dim} +cells = {firstdircells} {otherdircells} +partitioning = {ranks} {ones} + +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +fastdg = 1 +sumfact = 1 +vectorize_quad = 1 +vectorize_diagonal = 1 +instrumentation_level = 2, 3, 4 | expand +opcounter = 1, 0 | expand opcount +time_opcounter = 0, 1 | expand opcount +quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval +architecture = knl + +[formcompiler.ufl_variants] +cell = hexahedron +degree = 1, 3, 5, 7, 9 | expand diff --git a/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl new file mode 100644 index 0000000000000000000000000000000000000000..00267f15b9331b4705138cfb1a4698ee357a5767 --- /dev/null +++ b/applications/knl/poisson_dg_tensor/poisson_dg_tensor.ufl @@ -0,0 +1,35 @@ +dim = 3 +x = SpatialCoordinate(cell) + +g = x[0]*x[0] + x[1]*x[1] + x[2]*x[2] +I = Identity(3) +A = as_matrix([[x[i]*x[j] + I[i,j] for j in range(3)] for i in range(3)]) +f = -6. +c = 10. + +V = FiniteElement("DG", cell, degree) + +u = TrialFunction(V) +v = TestFunction(V) + +n = FacetNormal(cell)('+') + +alpha = 3.0 +h_ext = CellVolume(cell) / FacetArea(cell) +cse_gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext +h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell) +cse_gamma_int = (alpha * degree * (degree + dim - 1)) / h_int + +theta = -1.0 + +r = (inner(A*grad(u), grad(v)) + (c*u-f)*v)*dx \ + + inner(n, A*avg(grad(u)))*jump(v)*dS \ + + cse_gamma_int*jump(u)*jump(v)*dS \ + - theta*jump(u)*inner(A*avg(grad(v)), n)*dS \ + - inner(n, A*grad(u))*v*ds \ + + cse_gamma_ext*u*v*ds \ + + theta*u*inner(A*grad(v), n)*ds \ + - theta*g*inner(A*grad(v), n)*ds \ + - cse_gamma_ext*g*v*ds + +forms = [r] diff --git a/applications/knl/poisson_dg_tensor/verify.mini b/applications/knl/poisson_dg_tensor/verify.mini new file mode 100644 index 0000000000000000000000000000000000000000..2f0d7170f89fccc6c4e65888f8cdab176c2dd015 --- /dev/null +++ b/applications/knl/poisson_dg_tensor/verify.mini @@ -0,0 +1,21 @@ +__name = verify_app_knl_poisson_dg_tensor + +# Setup the grid! +extension = 1.0 1.0 1.0 +cells = 8 8 8 + +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +fastdg = 1 +sumfact = 1 +vectorize_quad = 1 +vectorize_diagonal = 1 +quadrature_order = 6 +architecture = knl + +[formcompiler.ufl_variants] +cell = hexahedron +degree = 3 diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index e8ea4bd424beb1d30c28ca716113021b2713aa6b..36a037e322cf430c6b0fbc8495e06512fa84b1ac 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -11,4 +11,5 @@ dune_install_python_script(SCRIPT performance_regression.py ) dune_symlink_to_source_files(FILES make_graph.sh + knltimings.sh ) diff --git a/bin/knltimings.sh b/bin/knltimings.sh new file mode 100755 index 0000000000000000000000000000000000000000..2f238395262f6eb11a35707f7d6556541eaac673 --- /dev/null +++ b/bin/knltimings.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# If an argument was given use it as the working directory +if [ $# -eq 1 ] +then + cd $1 +fi + +# Search for runnable executables +FILES=$(ls *.ini) +for inifile in $FILES +do + line=$(grep ^"opcounter = " $inifile) + extract=${line##opcounter = } + UPPER=10 + if [ $extract -eq 1 ] + then + UPPER=1 + fi + COUNT=0 + while [ $COUNT -lt $UPPER ]; do + exec=${inifile%.ini} + line=$(grep ^"ranks = " $inifile) + np=${line##ranks = } + mpirun -np $np ./$exec $inifile + COUNT=$((COUNT + 1)) + done +done diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh index 3b857c35cbb90bd34a24b8b35c300c8b30df3eaf..8d31883eb80faebe8101f2e045d0da31b029642b 100644 --- a/dune/perftool/common/vectorclass.hh +++ b/dune/perftool/common/vectorclass.hh @@ -165,7 +165,7 @@ static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) { static inline Vec4d operator - (Vec4d const & a) { BARRIER; Vec4d r(a); - for (size_t i = 0 ; i < 3 ; ++i) + for (size_t i = 0 ; i < 4 ; ++i) r._d[i] = -a._d[i]; BARRIER; return r; @@ -590,6 +590,427 @@ static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) { return r; } +#if MAC_VECTOR_SIZE >= 512 + +struct Vec8d +{ + oc::OpCounter<double> _d[8]; + + using F = oc::OpCounter<double>; + + Vec8d() + {} + + Vec8d(F d) + { + BARRIER; + std::fill(_d,_d+8,d); + BARRIER; + } + + Vec8d(double d) + { + BARRIER; + std::fill(_d,_d+8,d); + BARRIER; + } + + Vec8d(F d0, F d1, F d2, F d3, F d4, F d5, F d6, F d7) + : _d{d0,d1,d2,d3,d4,d5,d6,d7} + { + BARRIER; + } + + Vec8d& load(const F* p) + { + BARRIER; + std::copy(p,p+8,_d); + BARRIER; + return *this; + } + + Vec8d& load_a(const F* p) + { + BARRIER; + std::copy(p,p+8,_d); + BARRIER; + return *this; + } + + void store(F* p) const + { + BARRIER; + std::copy(_d,_d+8,p); + BARRIER; + } + + void store_a(F* p) const + { + BARRIER; + std::copy(_d,_d+8,p); + BARRIER; + } + + Vec8d const& insert(uint32_t index, F value) + { + BARRIER; + _d[index] = value; + BARRIER; + return *this; + } + + F extract(uint32_t index) const + { + BARRIER; + return _d[index]; + } + + constexpr static int size() + { + return 8; + } + +}; + + +/***************************************************************************** +* +* Operators for Vec8d +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec8d operator + (Vec8d const & a, Vec8d const & b) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x + y; }); + BARRIER; + return r; +} + +// vector operator += : add +static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) { + BARRIER; + std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x + y; }); + BARRIER; + return a; +} + +// postfix operator ++ +static inline Vec8d operator ++ (Vec8d & a, int) { + BARRIER; + Vec8d a0 = a; + a = a + 1.0; + BARRIER; + return a0; +} + +// prefix operator ++ +static inline Vec8d & operator ++ (Vec8d & a) { + BARRIER; + a = a + 1.0; + BARRIER; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8d operator - (Vec8d const & a, Vec8d const & b) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x - y; }); + BARRIER; + return r; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec8d operator - (Vec8d const & a) { + BARRIER; + Vec8d r(a); + for (size_t i = 0 ; i < 8 ; ++i) + r._d[i] = -a._d[i]; + BARRIER; + return r; +} + +// vector operator -= : subtract +static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) { + BARRIER; + std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x - y; }); + BARRIER; + return a; +} + +// postfix operator -- +static inline Vec8d operator -- (Vec8d & a, int) { + BARRIER; + Vec8d a0 = a; + a = a - 1.0; + BARRIER; + return a0; +} + +// prefix operator -- +static inline Vec8d & operator -- (Vec8d & a) { + BARRIER; + a = a - 1.0; + BARRIER; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8d operator * (Vec8d const & a, Vec8d const & b) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x * y; }); + BARRIER; + return r; +} + +// vector operator *= : multiply +static inline Vec8d & operator *= (Vec8d & a, Vec8d const & b) { + BARRIER; + std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x * y; }); + BARRIER; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec8d operator / (Vec8d const & a, Vec8d const & b) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x / y; }); + BARRIER; + return r; +} + +// vector operator /= : divide +static inline Vec8d & operator /= (Vec8d & a, Vec8d const & b) { + BARRIER; + std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x / y; }); + BARRIER; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline _vcl::Vec8db operator == (Vec8d const & a, Vec8d const & b) { + BARRIER; + _vcl::Vec8d a_, b_; + BARRIER; + a_.load(a._d[0].data()); + BARRIER; + b_.load(b._d[0].data()); + BARRIER; + Vec8d::F::comparisons(8); + BARRIER; + return a_ == b_; +} + +// vector operator != : returns true for elements for which a != b +static inline _vcl::Vec8db operator != (Vec8d const & a, Vec8d const & b) { + BARRIER; + _vcl::Vec8d a_, b_; + BARRIER; + a_.load(a._d[0].data()); + BARRIER; + b_.load(b._d[0].data()); + BARRIER; + Vec8d::F::comparisons(8); + BARRIER; + return a_ != b_; +} + +// vector operator < : returns true for elements for which a < b +static inline _vcl::Vec8db operator < (Vec8d const & a, Vec8d const & b) { + BARRIER; + _vcl::Vec8d a_, b_; + BARRIER; + a_.load(a._d[0].data()); + BARRIER; + b_.load(b._d[0].data()); + BARRIER; + Vec8d::F::comparisons(8); + BARRIER; + return a_ < b_; +} + +// vector operator <= : returns true for elements for which a <= b +static inline _vcl::Vec8db operator <= (Vec8d const & a, Vec8d const & b) { + BARRIER; + _vcl::Vec8d a_, b_; + BARRIER; + a_.load(a._d[0].data()); + BARRIER; + b_.load(b._d[0].data()); + BARRIER; + Vec8d::F::comparisons(8); + BARRIER; + return a_ <= b_; +} + +// vector operator > : returns true for elements for which a > b +static inline _vcl::Vec8db operator > (Vec8d const & a, Vec8d const & b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b +static inline _vcl::Vec8db operator >= (Vec8d const & a, Vec8d const & b) { + return b <= a; +} + +// General arithmetic functions, etc. + +// Horizontal add: Calculates the sum of all vector elements. +static inline Vec8d::F horizontal_add (Vec8d const & a) { + BARRIER; + return std::accumulate(a._d,a._d+8,Vec8d::F(0.0)); + BARRIER; +} + +// function max: a > b ? a : b +static inline Vec8d max(Vec8d const & a, Vec8d const & b) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return max(x,y); }); + BARRIER; + return r; +} + +// function min: a < b ? a : b +static inline Vec8d min(Vec8d const & a, Vec8d const & b) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return min(x,y); }); + BARRIER; + return r; +} + +// function abs: absolute value +// Removes sign bit, even for -0.0f, -INF and -NAN +static inline Vec8d abs(Vec8d const & a) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,r._d,[](auto x){ return abs(x); }); + BARRIER; + return r; +} + +// function sqrt: square root +static inline Vec8d sqrt(Vec8d const & a) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,r._d,[](auto x){ return sqrt(x); }); + BARRIER; + return r; +} + +// function square: a * a +static inline Vec8d square(Vec8d const & a) { + return a * a; +} + + +// exponential function +static inline Vec8d exp(Vec8d const & a){ + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,r._d,[](auto x){ return exp(x); }); + BARRIER; + return r; +} + +// function round: round to nearest integer (even). (result as double vector) +static inline Vec8d round(Vec8d const & a) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,r._d,[](auto x){ return round(x); }); + BARRIER; + return r; +} + +// function truncate: round towards zero. (result as double vector) +static inline Vec8d truncate(Vec8d const & a) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,r._d,[](auto x){ return trunc(x); }); + BARRIER; + return r; +} + +// function floor: round towards minus infinity. (result as double vector) +static inline Vec8d floor(Vec8d const & a) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,r._d,[](auto x){ return floor(x); }); + BARRIER; + return r; +} + +// function ceil: round towards plus infinity. (result as double vector) +static inline Vec8d ceil(Vec8d const & a) { + BARRIER; + Vec8d r; + std::transform(a._d,a._d+8,r._d,[](auto x){ return ceil(x); }); + BARRIER; + return r; +} + +// Fused multiply and add functions + +// Multiply and add +static inline Vec8d mul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) { + BARRIER; + Vec8d r; + for (size_t i = 0 ; i < 8 ; ++i) + r._d[i] = a._d[i] * b._d[i] + c._d[i]; + BARRIER; + return r; +} + + +// Multiply and subtract +static inline Vec8d mul_sub(Vec8d const & a, Vec8d const & b, Vec8d const & c) { + BARRIER; + Vec8d r; + for (size_t i = 0 ; i < 8 ; ++i) + r._d[i] = a._d[i] * b._d[i] - c._d[i]; + BARRIER; + return r; +} + +// Multiply and inverse subtract +static inline Vec8d nmul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) { + BARRIER; + Vec8d r; + for (size_t i = 0 ; i < 8 ; ++i) + r._d[i] = - a._d[i] * b._d[i] + c._d[i]; + BARRIER; + return r; +} + + +template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> +static inline Vec8d blend8d(Vec8d const & a, Vec8d const & b) { + BARRIER; + _vcl::Vec8d a_,b_; + BARRIER; + a_.load(a._d[0].data()); + BARRIER; + b_.load(b._d[0].data()); + BARRIER; + _vcl::Vec8d r_ = _vcl::blend8d<i0,i1,i2,i3,i4,i5,i6,i7>(a_,b_); + BARRIER; + Vec8d::F::blends(1); + BARRIER; + Vec8d r; + BARRIER; + r_.store(r._d[0].data()); + BARRIER; + return r; +} + +#endif // MAC_VECTOR_SIZE >= 512 + #endif // ENABLE_COUNTER #endif // DUNE_PDELAB_COMMON_VECTORCLASS_HH diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh index 5e84c98b935f191b1df88ade2ab50e3753d7e3c4..9d924bf900afc5fc1865b142cb0501d8a78b7a34 100644 --- a/dune/perftool/sumfact/transposereg.hh +++ b/dune/perftool/sumfact/transposereg.hh @@ -3,20 +3,6 @@ #include<dune/perftool/common/vectorclass.hh> - -#if MAX_VECTOR_SIZE >= 128 - -void transpose_reg(Vec2d& a0, Vec2d& a1) -{ - Vec2d b0, b1; - b0 = blend2d<0,2>(a0, a1); - b1 = blend2d<1,3>(a0, a1); - a0 = b0; - a1 = b1; -} - -#endif - #if MAX_VECTOR_SIZE >= 256 void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3) diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index 89c5791e0edb1d08eb51bff020a910d2b4f7c8ac..df7bc8314ebd2ef3f0d79b70ca7222ceb3e92b71 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -55,7 +55,8 @@ def _realize_sum_factorization_kernel(sf): insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), within_inames=frozenset(sf.within_inames), depends_on=insn_dep, - ),})) + ), + })) # Set up the input for stage 1 if sf.stage == 1 and not get_option("fastdg"): diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index 0cbb6b814b13b3f60bacc065ea539ed0a8a37af5..790548f1b99a9cd36d8e3c395890debb419c2cec 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -120,8 +120,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): # The following construction is a bit weird: Dict comprehensions do not have # access to the locals of the calling scope: So we need to do the eval beforehand - defaults = [eval(a) for a in SumfactKernel.init_arg_names] - defaultdict = {a: defaults[SumfactKernel.init_arg_names.index(a)] for a in SumfactKernel.init_arg_names} + defaultdict = {} + for a in SumfactKernel.init_arg_names: + defaultdict[a] = eval(a) # Call the base class constructors ImmutableRecord.__init__(self, **defaultdict) diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index 466690a4a9fa3fc414ba790c93a098e398abfad5..92a8883eda78d836f002a984afdc66f95c830f1d 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -162,7 +162,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): def pymbolic(self, indices): assert len(indices) == 3 - # Check whether we can realize this by broadcasting the values of a simple tabulation + # Check whether we can realize this by broadcasting the values of a simple tabulation if len(set(self.tabs)) == 1: theta = self.tabs[0].pymbolic(indices[:-1]) return prim.Call(ExplicitVCLCast(np.float64, vector_width=len(self.tabs)), (theta,)) diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index b856ccc76afc649e8af310f9fdfebaf5df59f4d7..6900885499385a62fc9dbca9d1e9adac7e444551 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -133,12 +133,14 @@ def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1): def diagonal_vectorization_strategy(sumfacts, width): if width == 4: horizontal, vertical = 2, 2 + elif width == 8: + horizontal, vertical = 4, 2 else: raise NotImplementedError result = {} - horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=0) + horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=1) for sf in horizontal_kernels: vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width) @@ -164,19 +166,23 @@ def decide_vectorization_strategy(): for inputkey in inputkeys: width = get_vcl_type_size(np.float64) sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey] - sfdict.update(**horizontal_vectorization_strategy(sumfact_filter, width)) + for old, new in horizontal_vectorization_strategy(sumfact_filter, width).items(): + sfdict[old] = new elif get_option("vectorize_slice"): for sumfact in sumfacts: width = get_vcl_type_size(np.float64) - sfdict.update(**vertical_vectorization_strategy(sumfact, width)) + for old, new in vertical_vectorization_strategy(sumfact, width).items(): + sfdict[old] = new elif get_option("vectorize_diagonal"): inputkeys = set(sf.input_key for sf in sumfacts) for inputkey in inputkeys: width = get_vcl_type_size(np.float64) sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey] - sfdict.update(**diagonal_vectorization_strategy(sumfact_filter, width)) + for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items(): + sfdict[old] = new else: - sfdict.update(**no_vectorization(sumfacts)) + for old, new in no_vectorization(sumfacts).items(): + sfdict[old] = new # Register the results for sf in sumfacts: