diff --git a/applications/knl/poisson_dg/knl_poisson_dg.mini b/applications/knl/poisson_dg/knl_poisson_dg.mini index ba2f345fd39edcca22fbbb9aded8e4bd3193157c..41755b7e08a746af270a3d6572dea6dc6273404a 100644 --- a/applications/knl/poisson_dg/knl_poisson_dg.mini +++ b/applications/knl/poisson_dg/knl_poisson_dg.mini @@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for opcount_suffix = opcount, nonopcount | expand opcount {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude +dune-opcounter_FOUND, 1 | expand opcount | cmake_guard # Calculate the size of the grid to equlibritate it to 100 MB/rank # Input parameters diff --git a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini index 4f3f5826f71fd9fc742efa4d88ae222e45759529..9d87f7eea1d29b61fc50ee460b8a1098b42cfc5a 100644 --- a/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini +++ b/applications/knl/poisson_dg_tensor/knl_poisson_dg_tensor.mini @@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for opcount_suffix = opcount, nonopcount | expand opcount {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude +dune-opcounter_FOUND, 1 | expand opcount | cmake_guard # Calculate the size of the grid to equlibritate it to 100 MB/rank # Input parameters diff --git a/applications/poisson_dg/poisson_dg.mini b/applications/poisson_dg/poisson_dg.mini index 7e65a4815f5fccc2dc6aae955bbdef559e9ebe92..d031d8233611e2492ef17eb80c3ae5d24518183f 100644 --- a/applications/poisson_dg/poisson_dg.mini +++ b/applications/poisson_dg/poisson_dg.mini @@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for opcount_suffix = opcount, nonopcount | expand opcount {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude +dune-opcounter_FOUND, 1 | expand opcount | cmake_guard # Calculate the size of the grid to equlibritate it to 100 MB/rank # Input parameters diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini index 4e7ac555b33fa5fcef568cca5b6dddf6dd3552e3..a5738a46a076588437d0cf2a509fa4623290af2f 100644 --- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini +++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini @@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.degree}_{opcount_suffix}_level{for opcount_suffix = opcount, nonopcount | expand opcount {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude +dune-opcounter_FOUND, 1 | expand opcount | cmake_guard # Calculate the size of the grid to equlibritate it to 100 MB/rank # Input parameters diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini index a80567f9b73473ae09e52c3a768177431435e849..6160ced2c164d0fb777f4777234bb30dd0d11768 100644 --- a/applications/stokes_dg/stokes_dg.mini +++ b/applications/stokes_dg/stokes_dg.mini @@ -3,6 +3,7 @@ __exec_suffix = deg{formcompiler.ufl_variants.v_degree}_{opcount_suffix}_level{f opcount_suffix = opcount, nonopcount | expand opcount {opcount_suffix} == opcount and {formcompiler.instrumentation_level} != 4 | exclude +dune-opcounter_FOUND, 1 | expand opcount | cmake_guard # Calculate the size of the grid to equlibritate it to 100 MB/rank # Input parameters diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake index 634d7ad35605ef6707436eaad98f882c40bb4c1d..2b1d87728c04a9359df79f3711c0df9c4f3f621e 100644 --- a/cmake/modules/GeneratedSystemtests.cmake +++ b/cmake/modules/GeneratedSystemtests.cmake @@ -96,7 +96,7 @@ function(dune_add_formcompiler_system_test) ) set_tests_properties(${tname} PROPERTIES SKIP_RETURN_CODE 77) - set_tests_properties(${tname} PROPERTIES TIMEOUT 60) + set_tests_properties(${tname} PROPERTIES TIMEOUT 120) endif() endforeach() endfunction() diff --git a/dune.module b/dune.module index 264e9fb20580d83c62defce6410f9f3326728cc5..c6a0e7bb6e053b8d274511f4dbb54d9d2eab84c4 100644 --- a/dune.module +++ b/dune.module @@ -8,3 +8,4 @@ Version: 0.0 Maintainer: dominic.kempf@iwr.uni-heidelberg.de #depending on Depends: dune-testtools dune-pdelab dune-alugrid +Suggests: dune-opcounter diff --git a/dune/perftool/common/CMakeLists.txt b/dune/perftool/common/CMakeLists.txt index 31520e34b6dde5cc0de439d9885128f44e5b1a1c..6cd100631e8241eb91cbd4a3f11b27197941b8c9 100644 --- a/dune/perftool/common/CMakeLists.txt +++ b/dune/perftool/common/CMakeLists.txt @@ -1,6 +1,7 @@ install(FILES muladd_workarounds.hh - opcounter.hh timer.hh + timer_tsc.hh + timer_chrono.hh tsc.hh vectorclass.hh DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/dune/perftool/common diff --git a/dune/perftool/common/muladd_workarounds.hh b/dune/perftool/common/muladd_workarounds.hh index f05dab7c8695d5f6d587a86b473a2fc86a9f372c..ba728fad57bbbe78ed997c6b8f42f5cbf265988b 100644 --- a/dune/perftool/common/muladd_workarounds.hh +++ b/dune/perftool/common/muladd_workarounds.hh @@ -3,36 +3,14 @@ /* We are currently having some issues with FMA nodes not being * eliminated correctly upon code generation. We "solve" the problem - * for now with overloads of the mul_add function for scalars. + * for now with a generic implementation of the mul_add function. */ -#include<dune/perftool/common/opcounter.hh> - -inline double mul_add(double op1, double& op2, double op3) -{ - return op1 * op2 + op3; -} - - -inline float mul_add(float op1, float& op2, float op3) +template<typename T> +inline T mul_add(T op1, T op2, T op3) { - return op1 * op2 + op3; + return op1 * op2 + op3; } - -#ifdef ENABLE_COUNTER - -oc::OpCounter<double> mul_add(oc::OpCounter<double> op1, oc::OpCounter<double>& op2, oc::OpCounter<double> op3) -{ - return op1 * op2 + op3; -} - -oc::OpCounter<float> mul_add(oc::OpCounter<float> op1, oc::OpCounter<float>& op2, oc::OpCounter<float> op3) -{ - return op1 * op2 + op3; -} - -#endif - #endif diff --git a/dune/perftool/common/opcounter.hh b/dune/perftool/common/opcounter.hh deleted file mode 100644 index edb16eaacc39f3c383fc7ace2fc7b733354116e7..0000000000000000000000000000000000000000 --- a/dune/perftool/common/opcounter.hh +++ /dev/null @@ -1,974 +0,0 @@ -#ifndef __OPCOUNTER__ -#define __OPCOUNTER__ - -#include <type_traits> -#include <iostream> -#include <cmath> -#include <cstdlib> -#include <ostream> -#include <sstream> -#include <istream> - -#include <dune/common/typetraits.hh> - -namespace oc { - - template<typename F> - class OpCounter; - - template<typename T> - struct isOpCounter : public std::false_type - {}; - - template<typename F> - struct isOpCounter<OpCounter<F>> : public std::true_type - {}; - - template<typename T> - constexpr bool isOpCounterV = isOpCounter<T>::value; -} - -namespace Dune { - - template <typename T> - struct IsNumber<oc::OpCounter<T>> - : public std::integral_constant<bool, IsNumber<T>::value>{ - }; - - template<typename T, int n> - class FieldVector; - -} - -namespace oc { - - template<typename F> - class OpCounter - { - - public: - - using size_type = std::size_t; - - using value_type = F; - - OpCounter() - : _v() - {} - - template<typename T> - OpCounter(const T& t, typename std::enable_if<std::is_same<T,int>::value and !std::is_same<F,int>::value>::type* = nullptr) - : _v(t) - {} - - OpCounter(const F& f) - : _v(f) - {} - - OpCounter(F&& f) - : _v(f) - {} - - explicit OpCounter(const char* s) - : _v(strtod(s,nullptr)) - {} - - OpCounter& operator=(const char* s) - { - _v = strtod(s,nullptr); - return *this; - } - - explicit operator F() const - { - return _v; - } - - OpCounter& operator=(const F& f) - { - _v = f; - return *this; - } - - OpCounter& operator=(F&& f) - { - _v = f; - return *this; - } - - friend std::ostream& operator<<(std::ostream& os, const OpCounter& f) - { - os << "OC(" << f._v << ")"; - return os; - } - - friend std::istringstream& operator>>(std::istringstream& iss, OpCounter& f) - { - iss >> f._v; - return iss; - } - - friend std::istream& operator>>(std::istream& is, OpCounter& f) - { - is >> f._v; - return is; - } - - F* data() - { - return &_v; - } - - const F* data() const - { - return &_v; - } - - F _v; - - // The alignment is necessary to work around - // https://bugs.llvm.org/show_bug.cgi?id=34219: - // "SLP vectorizer: aligned store to unaligned address" - // The bug is present in 3.8.0 (tags/RELEASE_380/final 263969) and in - // clang version 6.0.0 (trunk 310993) - struct alignas(16) Counters { - - size_type addition_count; - size_type multiplication_count; - size_type division_count; - size_type exp_count; - size_type pow_count; - size_type sin_count; - size_type sqrt_count; - size_type comparison_count; - size_type blend_count; - size_type permute_count; - - Counters() - : addition_count(0) - , multiplication_count(0) - , division_count(0) - , exp_count(0) - , pow_count(0) - , sin_count(0) - , sqrt_count(0) - , comparison_count(0) - , blend_count(0) - , permute_count(0) - {} - - void reset() - { - addition_count = 0; - multiplication_count = 0; - division_count = 0; - exp_count = 0; - pow_count = 0; - sin_count = 0; - sqrt_count = 0; - comparison_count = 0; - blend_count = 0; - permute_count = 0; - } - - template<typename Stream, typename T> - void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false) - { - auto total = addition_count + multiplication_count + division_count + exp_count + pow_count + sin_count + sqrt_count + comparison_count; - if (total == 0) - return; - os << level << " " << exec << " " << kernel << " additions " << addition_count << std::endl - << level << " " << exec << " " << kernel << " multiplications " << multiplication_count << std::endl - << level << " " << exec << " " << kernel << " divisions " << division_count << std::endl - << level << " " << exec << " " << kernel << " exp " << exp_count << std::endl - << level << " " << exec << " " << kernel << " pow " << pow_count << std::endl - << level << " " << exec << " " << kernel << " sin " << sin_count << std::endl - << level << " " << exec << " " << kernel << " sqrt " << sqrt_count << std::endl - << level << " " << exec << " " << kernel << " comparisons " << comparison_count << std::endl - << level << " " << exec << " " << kernel << " blends " << blend_count << std::endl - << level << " " << exec << " " << kernel << " permutes " << permute_count << std::endl - << level << " " << exec << " " << kernel << " totalops " << total << std::endl; - - if (doReset) - reset(); - } - - Counters& operator+=(const Counters& rhs) - { - addition_count += rhs.addition_count; - multiplication_count += rhs.multiplication_count; - division_count += rhs.division_count; - exp_count += rhs.exp_count; - pow_count += rhs.pow_count; - sin_count += rhs.sin_count; - sqrt_count += rhs.sqrt_count; - comparison_count += rhs.comparison_count; - blend_count += rhs.blend_count; - permute_count += rhs.permute_count; - return *this; - } - - Counters operator-(const Counters& rhs) - { - Counters r; - r.addition_count = addition_count - rhs.addition_count; - r.multiplication_count = multiplication_count - rhs.multiplication_count; - r.division_count = division_count - rhs.division_count; - r.exp_count = exp_count - rhs.exp_count; - r.pow_count = pow_count - rhs.pow_count; - r.sin_count = sin_count - rhs.sin_count; - r.sqrt_count = sqrt_count - rhs.sqrt_count; - r.comparison_count = comparison_count - rhs.comparison_count; - r.blend_count = blend_count - rhs.blend_count; - r.permute_count = permute_count - rhs.permute_count; - return r; - } - - }; - - static void additions(std::size_t n) - { - counters.addition_count += n; - } - - static void multiplications(std::size_t n) - { - counters.multiplication_count += n; - } - - static void divisions(std::size_t n) - { - counters.division_count += n; - } - - static void comparisons(std::size_t n) - { - counters.comparison_count += n; - } - - static void blends(std::size_t n) - { - counters.blend_count += n; - } - - static void permutes(std::size_t n) - { - counters.permute_count += n; - } - - static void reset() - { - counters.reset(); - } - - template<typename Stream, typename T> - static void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false) - { - counters.reportOperations(os, level, exec, kernel, doReset); - } - - static Counters counters; - - }; - - template<typename F> - typename OpCounter<F>::Counters OpCounter<F>::counters; - - // ******************************************************************************** - // negation - // ******************************************************************************** - - template<typename F> - OpCounter<F> operator-(const OpCounter<F>& a) - { - ++OpCounter<F>::counters.addition_count; - return {-a._v}; - } - - - // ******************************************************************************** - // addition - // ******************************************************************************** - - template<typename F> - OpCounter<F> operator+(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - return {a._v + b._v}; - } - - template<typename F> - OpCounter<F> operator+(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.addition_count; - return {a._v + b}; - } - - template<typename F> - OpCounter<F> operator+(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - return {a + b._v}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator+(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.addition_count; - return {a._v + b}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator+(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - return {a + b._v}; - } - - template<typename F> - OpCounter<F>& operator+=(OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - a._v += b._v; - return a; - } - - template<typename F> - OpCounter<F>& operator+=(OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.addition_count; - a._v += b; - return a; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F>& - >::type - operator+=(OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.addition_count; - a._v += b; - return a; - } - - template<typename F> - OpCounter<F>& operator+=(OpCounter<F>& a, const Dune::FieldVector<OpCounter<F>,1>& b) - { - ++OpCounter<F>::counters.addition_count; - a._v += b[0]._v; - return a; - } - - // ******************************************************************************** - // subtraction - // ******************************************************************************** - - template<typename F> - OpCounter<F> operator-(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - return {a._v - b._v}; - } - - template<typename F> - OpCounter<F> operator-(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.addition_count; - return {a._v - b}; - } - - template<typename F> - OpCounter<F> operator-(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - return {a - b._v}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator-(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.addition_count; - return {a._v - b}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator-(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - return {a - b._v}; - } - - template<typename F> - OpCounter<F>& operator-=(OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.addition_count; - a._v -= b._v; - return a; - } - - template<typename F> - OpCounter<F>& operator-=(OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.addition_count; - a._v -= b; - return a; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F>& - >::type - operator-=(OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.addition_count; - a._v -= b; - return a; - } - - - // ******************************************************************************** - // multiplication - // ******************************************************************************** - - template<typename F> - OpCounter<F> operator*(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.multiplication_count; - return {a._v * b._v}; - } - - template<typename F> - OpCounter<F> operator*(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.multiplication_count; - return {a._v * b}; - } - - template<typename F> - OpCounter<F> operator*(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.multiplication_count; - return {a * b._v}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator*(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.multiplication_count; - return {a._v * b}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator*(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.multiplication_count; - return {a * b._v}; - } - - template<typename F> - OpCounter<F>& operator*=(OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.multiplication_count; - a._v *= b._v; - return a; - } - - template<typename F> - OpCounter<F>& operator*=(OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.multiplication_count; - a._v *= b; - return a; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F>& - >::type - operator*=(OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.multiplication_count; - a._v *= b; - return a; - } - - - // ******************************************************************************** - // division - // ******************************************************************************** - - template<typename F> - OpCounter<F> operator/(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.division_count; - return {a._v / b._v}; - } - - template<typename F> - OpCounter<F> operator/(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.division_count; - return {a._v / b}; - } - - template<typename F> - OpCounter<F> operator/(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.division_count; - return {a / b._v}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator/(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.division_count; - return {a._v / b}; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F> - >::type - operator/(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.division_count; - return {a / b._v}; - } - - template<typename F> - OpCounter<F>& operator/=(OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.division_count; - a._v /= b._v; - return a; - } - - template<typename F> - OpCounter<F>& operator/=(OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.division_count; - a._v /= b; - return a; - } - - template<typename F, typename T> - typename std::enable_if< - std::is_arithmetic<T>::value, - OpCounter<F>& - >::type - operator/=(OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.division_count; - a._v /= b; - return a; - } - - - - // ******************************************************************************** - // comparisons - // ******************************************************************************** - - - // ******************************************************************************** - // less - // ******************************************************************************** - - template<typename F> - bool operator<(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v < b._v}; - } - - template<typename F> - bool operator<(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v < b}; - } - - template<typename F> - bool operator<(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a < b._v}; - } - - template<typename F, typename T> - bool operator<(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v < b}; - } - - template<typename F, typename T> - bool operator<(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a < b._v}; - } - - - // ******************************************************************************** - // less_or_equals - // ******************************************************************************** - - template<typename F> - bool operator<=(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v <= b._v}; - } - - template<typename F> - bool operator<=(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v <= b}; - } - - template<typename F> - bool operator<=(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a <= b._v}; - } - - template<typename F, typename T> - bool operator<=(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v <= b}; - } - - template<typename F, typename T> - bool operator<=(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a <= b._v}; - } - - - // ******************************************************************************** - // greater - // ******************************************************************************** - - template<typename F> - bool operator>(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v > b._v}; - } - - template<typename F> - bool operator>(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v > b}; - } - - template<typename F> - bool operator>(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a > b._v}; - } - - template<typename F, typename T> - bool operator>(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v > b}; - } - - template<typename F, typename T> - bool operator>(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a > b._v}; - } - - - // ******************************************************************************** - // greater_or_equals - // ******************************************************************************** - - template<typename F> - bool operator>=(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v >= b._v}; - } - - template<typename F> - bool operator>=(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v >= b}; - } - - template<typename F> - bool operator>=(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a >= b._v}; - } - - template<typename F, typename T> - bool operator>=(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v >= b}; - } - - template<typename F, typename T> - bool operator>=(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a >= b._v}; - } - - - // ******************************************************************************** - // inequals - // ******************************************************************************** - - template<typename F> - bool operator!=(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v != b._v}; - } - - template<typename F> - bool operator!=(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v != b}; - } - - template<typename F> - bool operator!=(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a != b._v}; - } - - template<typename F, typename T> - bool operator!=(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v != b}; - } - - template<typename F, typename T> - bool operator!=(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a != b._v}; - } - - - // ******************************************************************************** - // equals - // ******************************************************************************** - - template<typename F> - bool operator==(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v == b._v}; - } - - template<typename F> - bool operator==(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v == b}; - } - - template<typename F> - bool operator==(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a == b._v}; - } - - template<typename F, typename T> - bool operator==(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a._v == b}; - } - - template<typename F, typename T> - bool operator==(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - return {a == b._v}; - } - - - - // ******************************************************************************** - // functions - // ******************************************************************************** - - template<typename F> - OpCounter<F> exp(const OpCounter<F>& a) - { - ++OpCounter<F>::counters.exp_count; - return {std::exp(a._v)}; - } - - template<typename F> - OpCounter<F> pow(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.pow_count; - return {std::pow(a._v,b._v)}; - } - - template<typename F> - OpCounter<F> pow(const OpCounter<F>& a, const F& b) - { - ++OpCounter<F>::counters.pow_count; - return {std::pow(a._v,b)}; - } - - template<typename F, typename T> - OpCounter<F> pow(const OpCounter<F>& a, const T& b) - { - ++OpCounter<F>::counters.pow_count; - return {std::pow(a._v,b)}; - } - - template<typename F> - OpCounter<F> pow(const F& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.pow_count; - return {std::pow(a,b._v)}; - } - - template<typename F, typename T> - OpCounter<F> pow(const T& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.pow_count; - return {std::pow(a,b._v)}; - } - - template<typename F> - OpCounter<F> sin(const OpCounter<F>& a) - { - ++OpCounter<F>::counters.sin_count; - return {std::sin(a._v)}; - } - - template<typename F> - OpCounter<F> cos(const OpCounter<F>& a) - { - ++OpCounter<F>::counters.sin_count; - return {std::cos(a._v)}; - } - - template<typename F> - OpCounter<F> sqrt(const OpCounter<F>& a) - { - ++OpCounter<F>::counters.sqrt_count; - return {std::sqrt(a._v)}; - } - - template<typename F> - OpCounter<F> abs(const OpCounter<F>& a) - { - ++OpCounter<F>::counters.comparison_count; - return {std::abs(a._v)}; - } - - template<typename F> - OpCounter<F> max(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - using std::max; - return {max(a._v,b._v)}; - } - - template<typename F> - OpCounter<F> min(const OpCounter<F>& a, const OpCounter<F>& b) - { - ++OpCounter<F>::counters.comparison_count; - using std::min; - return {min(a._v,b._v)}; - } - - template<typename F> - OpCounter<F> round(const OpCounter<F>& a) - { - using std::round; - return {round(a._v)}; - } - - template<typename F> - OpCounter<F> trunc(const OpCounter<F>& a) - { - using std::trunc; - return {trunc(a._v)}; - } - - template<typename F> - OpCounter<F> ceil(const OpCounter<F>& a) - { - using std::ceil; - return {ceil(a._v)}; - } - - template<typename F> - OpCounter<F> floor(const OpCounter<F>& a) - { - using std::floor; - return {floor(a._v)}; - } - - template<typename F> - bool isnan(const OpCounter<F>& a) - { - using std::isnan; - return {isnan(a._v)}; - } - -} - -#endif // __OPCOUNTER__ diff --git a/dune/perftool/common/timer_chrono.hh b/dune/perftool/common/timer_chrono.hh index d1b0bd9b5d50e43a32a07623ef417c76337d8af6..8e96a70a2e4126471f2636300bfb9a46841d5f9f 100644 --- a/dune/perftool/common/timer_chrono.hh +++ b/dune/perftool/common/timer_chrono.hh @@ -7,7 +7,7 @@ #include <chrono> -#include <dune/perftool/common/opcounter.hh> +#include <dune/opcounter/opcounter.hh> #define HP_TIMER_OPCOUNTER oc::OpCounter<double> diff --git a/dune/perftool/common/timer_tsc.hh b/dune/perftool/common/timer_tsc.hh index 0f9e1e040f7d30ebe8d54c3d793e959699250e96..10ec75d98ffbb5a68425dbab21367fb20bdbf395 100644 --- a/dune/perftool/common/timer_tsc.hh +++ b/dune/perftool/common/timer_tsc.hh @@ -6,7 +6,7 @@ #endif #include <dune/perftool/common/tsc.hh> -#include <dune/perftool/common/opcounter.hh> +#include <dune/opcounter/opcounter.hh> #define HP_TIMER_DURATION(name) __hp_timer_##name##_duration #define HP_TIMER_STARTTIME(name) __hp_timer_##name##_start @@ -76,16 +76,22 @@ #ifdef ENABLE_COUNTER #define DUMP_TIMER(level,name,os,reset)\ - if (HP_TIMER_DURATION(name) > 1e-12) \ - os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ - HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset); + { \ + std::string prefix = std::string(#level) + " " + ident + " " + std::string(#name); \ + if (HP_TIMER_DURATION(name) > 1e-12) \ + os << prefix << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ + HP_TIMER_OPCOUNTERS(name).reportOperations(os,prefix,reset); \ + } #define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops) \ - if (HP_TIMER_DURATION(name) > 1e-12) \ - os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ - time += HP_TIMER_DURATION(name); \ - ops += HP_TIMER_OPCOUNTERS(name); \ - HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset); + { \ + std::string prefix = std::string(#level) + " " + ident + " " + std::string(#name); \ + if (HP_TIMER_DURATION(name) > 1e-12) \ + os << prefix << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ + time += HP_TIMER_DURATION(name); \ + ops += HP_TIMER_OPCOUNTERS(name); \ + HP_TIMER_OPCOUNTERS(name).reportOperations(os,prefix,reset); \ + } #elif defined ENABLE_HP_TIMERS diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh index 6204b0a213796861533b9668f4c9a869198eb7cc..9fe0cf17b47abda0691c14591be3fcc15ee6e4d7 100644 --- a/dune/perftool/common/vectorclass.hh +++ b/dune/perftool/common/vectorclass.hh @@ -1,1770 +1,72 @@ -#ifndef DUNE_PDELAB_COMMON_VECTORCLASS_HH -#define DUNE_PDELAB_COMMON_VECTORCLASS_HH +#ifndef DUNE_PERFTOOL_COMMON_VECTORCLASS_HH +#define DUNE_PERFTOOL_COMMON_VECTORCLASS_HH -#ifdef VECTORCLASS_H -#error Do not include the vectorclass.h header directly, always use this wrapper! -#endif - -#ifdef VCL_NAMESPACE -#error Do not manually set VCL_NAMESPACE, it is used internally by PDELab -#endif - -#define BARRIER asm volatile("": : :"memory") template<typename T> struct base_floatingpoint {}; -#ifndef ENABLE_COUNTER +#ifdef ENABLE_COUNTER +#if HAVE_DUNE_OPCOUNTER +#include<dune/opcounter/vectorclass.hh> + +template<typename F, int size> +struct base_floatingpoint<OpCounter::impl::OpCounterVector<F, size>> +{ + using value = OpCounter::OpCounter<F>; +}; -#include <dune/perftool/vectorclass/vectorclass.h> -#include <dune/perftool/vectorclass/vectormath_exp.h> -#include <dune/perftool/vectorclass/vectormath_trig.h> + +#else +#error "dune-opcounter is needed for opcounted vector types" +#endif +#else +#include<dune/perftool/vectorclass/vectorclass.h> +#include<dune/perftool/vectorclass/vectormath_exp.h> +#include<dune/perftool/vectorclass/vectormath_hyp.h> +#include<dune/perftool/vectorclass/vectormath_trig.h> template<> -struct base_floatingpoint<Vec4d> +struct base_floatingpoint<Vec2d> { using value = double; }; template<> -struct base_floatingpoint<Vec8f> +struct base_floatingpoint<Vec4f> { using value = float; }; -#if MAX_VECTOR_SIZE >= 512 +#if MAX_VECTOR_SIZE >= 256 template<> -struct base_floatingpoint<Vec8d> +struct base_floatingpoint<Vec4d> { using value = double; }; -#endif - -#else - -#include <algorithm> - -#define VCL_NAMESPACE _vcl -#include <dune/perftool/vectorclass/vectorclass.h> -#include <dune/perftool/vectorclass/vectormath_exp.h> -#include <dune/perftool/vectorclass/vectormath_trig.h> - -#include <dune/perftool/common/opcounter.hh> - -#include <numeric> - -struct Vec4d -{ - oc::OpCounter<double> _d[4]; - - using F = oc::OpCounter<double>; - - Vec4d() - {} - - Vec4d(F d) - { - BARRIER; - std::fill(_d,_d+4,d); - BARRIER; - } - - Vec4d(F dl, F du) - { - BARRIER; - std::fill(_d,_d+2,dl); - std::fill(_d+2,_d+4,du); - BARRIER; - } - - Vec4d(F d0, F d1, F d2, F d3) - : _d{d0,d1,d2,d3} - { - BARRIER; - } - - Vec4d& load(const F* p) - { - BARRIER; - std::copy(p,p+4,_d); - BARRIER; - return *this; - } - - Vec4d& load_a(const F* p) - { - BARRIER; - std::copy(p,p+4,_d); - BARRIER; - return *this; - } - - void store(F* p) const - { - BARRIER; - std::copy(_d,_d+4,p); - BARRIER; - } - - void store_a(F* p) const - { - BARRIER; - std::copy(_d,_d+4,p); - BARRIER; - } - - Vec4d const& insert(uint32_t index, F value) - { - BARRIER; - _d[index] = value; - BARRIER; - return *this; - } - - F extract(uint32_t index) const - { - BARRIER; - return _d[index]; - } - - F operator [] (uint32_t index) const { - return extract(index); - } - - constexpr static int size() - { - return 4; - } - -}; - template<> -struct base_floatingpoint<Vec4d> +struct base_floatingpoint<Vec8f> { - using value = typename Vec4d::F; + using value = float; }; - -/***************************************************************************** -* -* Operators for Vec4d -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x + y; }); - BARRIER; - return r; -} - -// vector operator += : add -static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) { - BARRIER; - std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x + y; }); - BARRIER; - return a; -} - -// postfix operator ++ -static inline Vec4d operator ++ (Vec4d & a, int) { - BARRIER; - Vec4d a0 = a; - a = a + Vec4d(1.0); - BARRIER; - return a0; -} - -// prefix operator ++ -static inline Vec4d & operator ++ (Vec4d & a) { - BARRIER; - a = a + Vec4d(1.0); - BARRIER; - return a; -} - -// vector operator - : subtract element by element -static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x - y; }); - BARRIER; - return r; -} - -// vector operator - : unary minus -// Change sign bit, even for 0, INF and NAN -static inline Vec4d operator - (Vec4d const & a) { - BARRIER; - Vec4d r(a); - for (size_t i = 0 ; i < 4 ; ++i) - r._d[i] = -a._d[i]; - BARRIER; - return r; -} - -// vector operator -= : subtract -static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) { - BARRIER; - std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x - y; }); - BARRIER; - return a; -} - -// postfix operator -- -static inline Vec4d operator -- (Vec4d & a, int) { - BARRIER; - Vec4d a0 = a; - a = a - Vec4d(1.0); - BARRIER; - return a0; -} - -// prefix operator -- -static inline Vec4d & operator -- (Vec4d & a) { - BARRIER; - a = a - Vec4d(1.0); - BARRIER; - return a; -} - -// vector operator * : multiply element by element -static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x * y; }); - BARRIER; - return r; -} - -// vector operator *= : multiply -static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) { - BARRIER; - std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x * y; }); - BARRIER; - return a; -} - -// vector operator / : divide all elements by same integer -static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return x / y; }); - BARRIER; - return r; -} - -// vector operator /= : divide -static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) { - BARRIER; - std::transform(a._d,a._d+4,b._d,a._d,[](auto x, auto y){ return x / y; }); - BARRIER; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline _vcl::Vec4db operator == (Vec4d const & a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ == b_; -} - -// vector operator == : returns true for elements for which a == b -static inline _vcl::Vec4db operator == (oc::OpCounter<double> a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ == b_; -} - -// vector operator == : returns true for elements for which a == b -static inline _vcl::Vec4db operator == (Vec4d const & b, oc::OpCounter<double> a) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ == b_; -} - -// vector operator != : returns true for elements for which a != b -static inline _vcl::Vec4db operator != (Vec4d const & a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ != b_; -} - -// vector operator != : returns true for elements for which a != b -static inline _vcl::Vec4db operator != (oc::OpCounter<double> a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ != b_; -} - -// vector operator != : returns true for elements for which a != b -static inline _vcl::Vec4db operator != (Vec4d const & b, oc::OpCounter<double> a) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ != b_; -} - -// vector operator < : returns true for elements for which a < b -static inline _vcl::Vec4db operator < (Vec4d const & a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ < b_; -} - -// vector operator < : returns true for elements for which a < b -static inline _vcl::Vec4db operator < (oc::OpCounter<double> a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ < b_; -} - -// vector operator < : returns true for elements for which a < b -static inline _vcl::Vec4db operator < (Vec4d const & b, oc::OpCounter<double> a) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return b_ < a_; -} - -// vector operator <= : returns true for elements for which a <= b -static inline _vcl::Vec4db operator <= (Vec4d const & a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ <= b_; -} - -// vector operator <= : returns true for elements for which a <= b -static inline _vcl::Vec4db operator <= (oc::OpCounter<double> a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return a_ <= b_; -} - -// vector operator <= : returns true for elements for which a <= b -static inline _vcl::Vec4db operator <= (Vec4d const & b, oc::OpCounter<double> a) { - BARRIER; - _vcl::Vec4d a_(a._v), b_; - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec4d::F::comparisons(4); - BARRIER; - return b_ <= a_; -} - -// vector operator > : returns true for elements for which a > b -static inline _vcl::Vec4db operator > (Vec4d const & a, Vec4d const & b) { - return b < a; -} - -// vector operator > : returns true for elements for which a > b -static inline _vcl::Vec4db operator > (oc::OpCounter<double> a, Vec4d const & b) { - return a < b; -} - -// vector operator > : returns true for elements for which a > b -static inline _vcl::Vec4db operator > (Vec4d const & b, oc::OpCounter<double> a) { - return a < b; -} - -// vector operator >= : returns true for elements for which a >= b -static inline _vcl::Vec4db operator >= (Vec4d const & a, Vec4d const & b) { - return b <= a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline _vcl::Vec4db operator >= (oc::OpCounter<double> a, Vec4d const & b) { - return b <= a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline _vcl::Vec4db operator >= (Vec4d const & b, oc::OpCounter<double> a) { - return a <= b; -} - - -// avoid logical operators for now, I don't think we need them -#if 0 - -// Bitwise logical operators - -// vector operator & : bitwise and -static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) { - return _mm256_and_pd(a, b); -} - -// vector operator &= : bitwise and -static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) { - a = a & b; - return a; -} - -// vector operator & : bitwise and of Vec4d and Vec4db -static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) { - return _mm256_and_pd(a, b); -} -static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) { - return _mm256_and_pd(a, b); -} - -// vector operator | : bitwise or -static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) { - return _mm256_or_pd(a, b); -} - -// vector operator |= : bitwise or -static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) { - return _mm256_xor_pd(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) { - a = a ^ b; - return a; -} - -// vector operator ! : logical not. Returns Boolean vector -static inline Vec4db operator ! (Vec4d const & a) { - return a == Vec4d(0.0); -} - -#endif - - -// General arithmetic functions, etc. - -// Horizontal add: Calculates the sum of all vector elements. -static inline Vec4d::F horizontal_add (Vec4d const & a) { - BARRIER; - return std::accumulate(a._d,a._d+4,Vec4d::F(0.0)); - BARRIER; -} - -// function max: a > b ? a : b -static inline Vec4d max(Vec4d const & a, Vec4d const & b) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return max(x,y); }); - BARRIER; - return r; -} - -// function min: a < b ? a : b -static inline Vec4d min(Vec4d const & a, Vec4d const & b) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,b._d,r._d,[](auto x, auto y){ return min(x,y); }); - BARRIER; - return r; -} - -// function abs: absolute value -// Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec4d abs(Vec4d const & a) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[](auto x){ return abs(x); }); - BARRIER; - return r; -} - -// function sqrt: square root -static inline Vec4d sqrt(Vec4d const & a) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[](auto x){ return sqrt(x); }); - BARRIER; - return r; -} - -// function square: a * a -static inline Vec4d square(Vec4d const & a) { - return a * a; -} - - -// exponential function -static inline Vec4d exp(Vec4d const & a){ - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[](auto x){ return exp(x); }); - BARRIER; - return r; -} - -// pow -template <typename TT> -static inline Vec4d pow(Vec4d const & a, oc::OpCounter<TT> n) -{ - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); }); - BARRIER; - return r; -} - -// pow -template <typename TT> -static inline -std::enable_if_t<not oc::isOpCounterV<TT>, Vec4d> pow(Vec4d const & a, TT n) -{ - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[=](auto x){ return pow(x, n); }); - BARRIER; - return r; -} - - -static inline Vec4d select(const _vcl::Vec4db& s, const Vec4d& a, const Vec4d& b) -{ - BARRIER; - Vec4d r; - for(int i=0; i<4; ++i) - r._d[i] = s.extract(i) ? a._d[i] : b._d[i]; - BARRIER; - return r; -} - -// function round: round to nearest integer (even). (result as double vector) -static inline Vec4d round(Vec4d const & a) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[](auto x){ return round(x); }); - BARRIER; - return r; -} - -// function truncate: round towards zero. (result as double vector) -static inline Vec4d truncate(Vec4d const & a) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[](auto x){ return trunc(x); }); - BARRIER; - return r; -} - -// function floor: round towards minus infinity. (result as double vector) -static inline Vec4d floor(Vec4d const & a) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[](auto x){ return floor(x); }); - BARRIER; - return r; -} - -// function ceil: round towards plus infinity. (result as double vector) -static inline Vec4d ceil(Vec4d const & a) { - BARRIER; - Vec4d r; - std::transform(a._d,a._d+4,r._d,[](auto x){ return ceil(x); }); - BARRIER; - return r; -} - -#if 0 -// function round_to_int: round to nearest integer (even). (result as integer vector) -static inline Vec4i round_to_int(Vec4d const & a) { - // Note: assume MXCSR control register is set to rounding - return _mm256_cvtpd_epi32(a); -} - -// function truncate_to_int: round towards zero. (result as integer vector) -static inline Vec4i truncate_to_int(Vec4d const & a) { - return _mm256_cvttpd_epi32(a); -} #endif - -// Fused multiply and add functions - -// Multiply and add -static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { - BARRIER; - Vec4d r; - for (size_t i = 0 ; i < 4 ; ++i) - r._d[i] = a._d[i] * b._d[i] + c._d[i]; - BARRIER; - return r; -} - - -// Multiply and subtract -static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) { - BARRIER; - Vec4d r; - for (size_t i = 0 ; i < 4 ; ++i) - r._d[i] = a._d[i] * b._d[i] - c._d[i]; - BARRIER; - return r; -} - -// Multiply and inverse subtract -static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { - BARRIER; - Vec4d r; - for (size_t i = 0 ; i < 4 ; ++i) - r._d[i] = - a._d[i] * b._d[i] + c._d[i]; - BARRIER; - return r; -} - - -template <int i0, int i1, int i2, int i3> -static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) { - BARRIER; - _vcl::Vec4d a_,b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - _vcl::Vec4d r_ = _vcl::blend4d<i0,i1,i2,i3>(a_,b_); - BARRIER; - Vec4d::F::blends(1); - BARRIER; - Vec4d r; - BARRIER; - r_.store(r._d[0].data()); - BARRIER; - return r; -} - -// permute vector Vec4d -template <int i0, int i1, int i2, int i3> -static inline Vec4d permute4d(Vec4d const & a) { - BARRIER; - _vcl::Vec4d a_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - _vcl::Vec4d r_ = _vcl::permute4d<i0,i1,i2,i3>(a_); - BARRIER; - Vec4d::F::permutes(1); - BARRIER; - Vec4d r; - BARRIER; - r_.store(r._d[0].data()); - BARRIER; - return r; -} - - #if MAX_VECTOR_SIZE >= 512 - -struct Vec8d -{ - oc::OpCounter<double> _d[8]; - - using F = oc::OpCounter<double>; - - Vec8d() - {} - - Vec8d(F d) - { - BARRIER; - std::fill(_d,_d+8,d); - BARRIER; - } - - Vec8d(F dl, F du) - { - BARRIER; - std::fill(_d,_d+4,dl); - std::fill(_d+4,_d+8,du); - BARRIER; - } - - Vec8d(Vec4d low, Vec4d high) - { - BARRIER; - std::copy(_d, _d+4, low._d); - std::copy(_d+4, _d+8, high._d); - BARRIER; - } - - Vec8d(F d0, F d1, F d2, F d3, F d4, F d5, F d6, F d7) - : _d{d0,d1,d2,d3,d4,d5,d6,d7} - { - BARRIER; - } - - Vec4d get_low() const - { - BARRIER; - Vec4d ret; - ret.load(_d); - BARRIER; - return ret; - } - - Vec4d get_high() const - { - BARRIER; - Vec4d ret; - ret.load(_d + 4); - BARRIER; - return ret; - } - - Vec8d& load(const F* p) - { - BARRIER; - std::copy(p,p+8,_d); - BARRIER; - return *this; - } - - Vec8d& load_a(const F* p) - { - BARRIER; - std::copy(p,p+8,_d); - BARRIER; - return *this; - } - - void store(F* p) const - { - BARRIER; - std::copy(_d,_d+8,p); - BARRIER; - } - - void store_a(F* p) const - { - BARRIER; - std::copy(_d,_d+8,p); - BARRIER; - } - - Vec8d const& insert(uint32_t index, F value) - { - BARRIER; - _d[index] = value; - BARRIER; - return *this; - } - - F extract(uint32_t index) const - { - BARRIER; - return _d[index]; - } - - F operator [] (uint32_t index) const { - return extract(index); - } - - constexpr static int size() - { - return 8; - } - -}; - template<> struct base_floatingpoint<Vec8d> { - using value = typename Vec8d::F; -}; - -/***************************************************************************** -* -* Operators for Vec8d -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec8d operator + (Vec8d const & a, Vec8d const & b) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x + y; }); - BARRIER; - return r; -} - -// vector operator += : add -static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x + y; }); - BARRIER; - return a; -} - -// postfix operator ++ -static inline Vec8d operator ++ (Vec8d & a, int) { - BARRIER; - Vec8d a0 = a; - a = a + Vec8d(1.0); - BARRIER; - return a0; -} - -// prefix operator ++ -static inline Vec8d & operator ++ (Vec8d & a) { - BARRIER; - a = a + Vec8d(1.0); - BARRIER; - return a; -} - -// vector operator - : subtract element by element -static inline Vec8d operator - (Vec8d const & a, Vec8d const & b) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x - y; }); - BARRIER; - return r; -} - -// vector operator - : unary minus -// Change sign bit, even for 0, INF and NAN -static inline Vec8d operator - (Vec8d const & a) { - BARRIER; - Vec8d r(a); - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = -a._d[i]; - BARRIER; - return r; -} - -// vector operator -= : subtract -static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x - y; }); - BARRIER; - return a; -} - -// postfix operator -- -static inline Vec8d operator -- (Vec8d & a, int) { - BARRIER; - Vec8d a0 = a; - a = a - Vec8d(1.0); - BARRIER; - return a0; -} - -// prefix operator -- -static inline Vec8d & operator -- (Vec8d & a) { - BARRIER; - a = a - Vec8d(1.0); - BARRIER; - return a; -} - -// vector operator * : multiply element by element -static inline Vec8d operator * (Vec8d const & a, Vec8d const & b) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x * y; }); - BARRIER; - return r; -} - -// vector operator *= : multiply -static inline Vec8d & operator *= (Vec8d & a, Vec8d const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x * y; }); - BARRIER; - return a; -} - -// vector operator / : divide all elements by same integer -static inline Vec8d operator / (Vec8d const & a, Vec8d const & b) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x / y; }); - BARRIER; - return r; -} - -// vector operator /= : divide -static inline Vec8d & operator /= (Vec8d & a, Vec8d const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x / y; }); - BARRIER; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline _vcl::Vec8db operator == (Vec8d const & a, Vec8d const & b) { - BARRIER; - _vcl::Vec8d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8d::F::comparisons(8); - BARRIER; - return a_ == b_; -} - -// vector operator != : returns true for elements for which a != b -static inline _vcl::Vec8db operator != (Vec8d const & a, Vec8d const & b) { - BARRIER; - _vcl::Vec8d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8d::F::comparisons(8); - BARRIER; - return a_ != b_; -} - -// vector operator < : returns true for elements for which a < b -static inline _vcl::Vec8db operator < (Vec8d const & a, Vec8d const & b) { - BARRIER; - _vcl::Vec8d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8d::F::comparisons(8); - BARRIER; - return a_ < b_; -} - -// vector operator <= : returns true for elements for which a <= b -static inline _vcl::Vec8db operator <= (Vec8d const & a, Vec8d const & b) { - BARRIER; - _vcl::Vec8d a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8d::F::comparisons(8); - BARRIER; - return a_ <= b_; -} - -// vector operator > : returns true for elements for which a > b -static inline _vcl::Vec8db operator > (Vec8d const & a, Vec8d const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline _vcl::Vec8db operator >= (Vec8d const & a, Vec8d const & b) { - return b <= a; -} - -// General arithmetic functions, etc. - -// Horizontal add: Calculates the sum of all vector elements. -static inline Vec8d::F horizontal_add (Vec8d const & a) { - BARRIER; - return std::accumulate(a._d,a._d+8,Vec8d::F(0.0)); - BARRIER; -} - -// function max: a > b ? a : b -static inline Vec8d max(Vec8d const & a, Vec8d const & b) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return max(x,y); }); - BARRIER; - return r; -} - -// function min: a < b ? a : b -static inline Vec8d min(Vec8d const & a, Vec8d const & b) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return min(x,y); }); - BARRIER; - return r; -} - -// function abs: absolute value -// Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec8d abs(Vec8d const & a) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return abs(x); }); - BARRIER; - return r; -} - -// function sqrt: square root -static inline Vec8d sqrt(Vec8d const & a) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return sqrt(x); }); - BARRIER; - return r; -} - -// function square: a * a -static inline Vec8d square(Vec8d const & a) { - return a * a; -} - - -// exponential function -static inline Vec8d exp(Vec8d const & a){ - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return exp(x); }); - BARRIER; - return r; -} - -// function round: round to nearest integer (even). (result as double vector) -static inline Vec8d round(Vec8d const & a) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return round(x); }); - BARRIER; - return r; -} - -// function truncate: round towards zero. (result as double vector) -static inline Vec8d truncate(Vec8d const & a) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return trunc(x); }); - BARRIER; - return r; -} - -// function floor: round towards minus infinity. (result as double vector) -static inline Vec8d floor(Vec8d const & a) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return floor(x); }); - BARRIER; - return r; -} - -// function ceil: round towards plus infinity. (result as double vector) -static inline Vec8d ceil(Vec8d const & a) { - BARRIER; - Vec8d r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return ceil(x); }); - BARRIER; - return r; -} - -// Fused multiply and add functions - -// Multiply and add -static inline Vec8d mul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) { - BARRIER; - Vec8d r; - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = a._d[i] * b._d[i] + c._d[i]; - BARRIER; - return r; -} - - -// Multiply and subtract -static inline Vec8d mul_sub(Vec8d const & a, Vec8d const & b, Vec8d const & c) { - BARRIER; - Vec8d r; - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = a._d[i] * b._d[i] - c._d[i]; - BARRIER; - return r; -} - -// Multiply and inverse subtract -static inline Vec8d nmul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) { - BARRIER; - Vec8d r; - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = - a._d[i] * b._d[i] + c._d[i]; - BARRIER; - return r; -} - - -template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> -static inline Vec8d blend8d(Vec8d const & a, Vec8d const & b) { - BARRIER; - _vcl::Vec8d a_,b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - _vcl::Vec8d r_ = _vcl::blend8d<i0,i1,i2,i3,i4,i5,i6,i7>(a_,b_); - BARRIER; - Vec8d::F::blends(1); - BARRIER; - Vec8d r; - BARRIER; - r_.store(r._d[0].data()); - BARRIER; - return r; -} - -// permute vector Vec4d -template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> -static inline Vec8d permute8d(Vec8d const & a) { - BARRIER; - _vcl::Vec8d a_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - _vcl::Vec8d r_ = _vcl::permute8d<i0,i1,i2,i3,i4,i5,i6,i7>(a_); - BARRIER; - Vec8d::F::permutes(1); - BARRIER; - Vec8d r; - BARRIER; - r_.store(r._d[0].data()); - BARRIER; - return r; -} - -#endif // MAC_VECTOR_SIZE >= 512 - - -struct Vec8f -{ - oc::OpCounter<float> _d[8]; - - using F = oc::OpCounter<float>; - - Vec8f() - {} - - Vec8f(F d) - { - BARRIER; - std::fill(_d,_d+8,d); - BARRIER; - } - - Vec8f(double d) - { - BARRIER; - std::fill(_d,_d+8,d); - BARRIER; - } - - Vec8f(F d0, F d1, F d2, F d3, F d4, F d5, F d6, F d7) - : _d{d0,d1,d2,d3,d4,d5,d6,d7} - { - BARRIER; - } - - Vec8f& load(const F* p) - { - BARRIER; - std::copy(p,p+8,_d); - BARRIER; - return *this; - } - - Vec8f& load_a(const F* p) - { - BARRIER; - std::copy(p,p+8,_d); - BARRIER; - return *this; - } - - void store(F* p) const - { - BARRIER; - std::copy(_d,_d+8,p); - BARRIER; - } - - void store_a(F* p) const - { - BARRIER; - std::copy(_d,_d+8,p); - BARRIER; - } - - Vec8f const& insert(uint32_t index, F value) - { - BARRIER; - _d[index] = value; - BARRIER; - return *this; - } - - F extract(uint32_t index) const - { - BARRIER; - return _d[index]; - } - - constexpr static int size() - { - return 8; - } - + using value = double; }; template<> -struct base_floatingpoint<Vec8f> +struct base_floatingpoint<Vec16f> { - using value = typename Vec8f::F; + using value = float; }; - -/***************************************************************************** -* -* Operators for Vec8f -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x + y; }); - BARRIER; - return r; -} - -// vector operator += : add -static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x + y; }); - BARRIER; - return a; -} - -// postfix operator ++ -static inline Vec8f operator ++ (Vec8f & a, int) { - BARRIER; - Vec8f a0 = a; - a = a + Vec8f(1.0); - BARRIER; - return a0; -} - -// prefix operator ++ -static inline Vec8f & operator ++ (Vec8f & a) { - BARRIER; - a = a + Vec8f(1.0); - BARRIER; - return a; -} - -// vector operator - : subtract element by element -static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x - y; }); - BARRIER; - return r; -} - -// vector operator - : unary minus -// Change sign bit, even for 0, INF and NAN -static inline Vec8f operator - (Vec8f const & a) { - BARRIER; - Vec8f r(a); - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = -a._d[i]; - BARRIER; - return r; -} - -// vector operator -= : subtract -static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x - y; }); - BARRIER; - return a; -} - -// postfix operator -- -static inline Vec8f operator -- (Vec8f & a, int) { - BARRIER; - Vec8f a0 = a; - a = a - Vec8f(1.0); - BARRIER; - return a0; -} - -// prefix operator -- -static inline Vec8f & operator -- (Vec8f & a) { - BARRIER; - a = a - Vec8f(1.0); - BARRIER; - return a; -} - -// vector operator * : multiply element by element -static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x * y; }); - BARRIER; - return r; -} - -// vector operator *= : multiply -static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x * y; }); - BARRIER; - return a; -} - -// vector operator / : divide all elements by same integer -static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return x / y; }); - BARRIER; - return r; -} - -// vector operator /= : divide -static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) { - BARRIER; - std::transform(a._d,a._d+8,b._d,a._d,[](auto x, auto y){ return x / y; }); - BARRIER; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline _vcl::Vec8fb operator == (Vec8f const & a, Vec8f const & b) { - BARRIER; - _vcl::Vec8f a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8f::F::comparisons(8); - BARRIER; - return a_ == b_; -} - -// vector operator != : returns true for elements for which a != b -static inline _vcl::Vec8fb operator != (Vec8f const & a, Vec8f const & b) { - BARRIER; - _vcl::Vec8f a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8f::F::comparisons(8); - BARRIER; - return a_ != b_; -} - -// vector operator < : returns true for elements for which a < b -static inline _vcl::Vec8fb operator < (Vec8f const & a, Vec8f const & b) { - BARRIER; - _vcl::Vec8f a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8f::F::comparisons(8); - BARRIER; - return a_ < b_; -} - -// vector operator <= : returns true for elements for which a <= b -static inline _vcl::Vec8fb operator <= (Vec8f const & a, Vec8f const & b) { - BARRIER; - _vcl::Vec8f a_, b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - Vec8f::F::comparisons(8); - BARRIER; - return a_ <= b_; -} - -// vector operator > : returns true for elements for which a > b -static inline _vcl::Vec8fb operator > (Vec8f const & a, Vec8f const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline _vcl::Vec8fb operator >= (Vec8f const & a, Vec8f const & b) { - return b <= a; -} - -// avoid logical operators for now, I don't think we need them -#if 0 - -// Bitwise logical operators - -// vector operator & : bitwise and -static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) { - return _mm256_and_pd(a, b); -} - -// vector operator &= : bitwise and -static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) { - a = a & b; - return a; -} - -// vector operator & : bitwise and of Vec8f and Vec8fb -static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) { - return _mm256_and_pd(a, b); -} -static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) { - return _mm256_and_pd(a, b); -} - -// vector operator | : bitwise or -static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) { - return _mm256_or_pd(a, b); -} - -// vector operator |= : bitwise or -static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) { - return _mm256_xor_pd(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) { - a = a ^ b; - return a; -} - -// vector operator ! : logical not. Returns Boolean vector -static inline Vec8fb operator ! (Vec8f const & a) { - return a == Vec8f(0.0); -} - #endif - -// General arithmetic functions, etc. - -// Horizontal add: Calculates the sum of all vector elements. -static inline Vec8f::F horizontal_add (Vec8f const & a) { - BARRIER; - return std::accumulate(a._d,a._d+8,Vec8f::F(0.0)); - BARRIER; -} - -// function max: a > b ? a : b -static inline Vec8f max(Vec8f const & a, Vec8f const & b) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return max(x,y); }); - BARRIER; - return r; -} - -// function min: a < b ? a : b -static inline Vec8f min(Vec8f const & a, Vec8f const & b) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,b._d,r._d,[](auto x, auto y){ return min(x,y); }); - BARRIER; - return r; -} - -// function abs: absolute value -// Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec8f abs(Vec8f const & a) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return abs(x); }); - BARRIER; - return r; -} - -// function sqrt: square root -static inline Vec8f sqrt(Vec8f const & a) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return sqrt(x); }); - BARRIER; - return r; -} - -// function square: a * a -static inline Vec8f square(Vec8f const & a) { - return a * a; -} - - -// exponential function -static inline Vec8f exp(Vec8f const & a){ - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return exp(x); }); - BARRIER; - return r; -} - - -// ignore pow() for now -#if 0 - -// pow(Vec8f, int): -template <typename TT> static Vec8f pow(Vec8f const & a, TT n); - -// Raise floating point numbers to integer power n -template <> -inline Vec8f pow<int>(Vec8f const & x0, int n) { - return pow_template_i<Vec8f>(x0, n); -} - -// allow conversion from unsigned int -template <> -inline Vec8f pow<uint32_t>(Vec8f const & x0, uint32_t n) { - return pow_template_i<Vec8f>(x0, (int)n); -} - - -// Raise floating point numbers to integer power n, where n is a compile-time constant -template <int n> -static inline Vec8f pow_n(Vec8f const & a) { - if (n < 0) return Vec8f(1.0) / pow_n<-n>(a); - if (n == 0) return Vec8f(1.0); - if (n >= 256) return pow(a, n); - Vec8f x = a; // a^(2^i) - Vec8f y; // accumulator - const int lowest = n - (n & (n-1));// lowest set bit in n - if (n & 1) y = x; - if (n < 2) return y; - x = x*x; // x^2 - if (n & 2) { - if (lowest == 2) y = x; else y *= x; - } - if (n < 4) return y; - x = x*x; // x^4 - if (n & 4) { - if (lowest == 4) y = x; else y *= x; - } - if (n < 8) return y; - x = x*x; // x^8 - if (n & 8) { - if (lowest == 8) y = x; else y *= x; - } - if (n < 16) return y; - x = x*x; // x^16 - if (n & 16) { - if (lowest == 16) y = x; else y *= x; - } - if (n < 32) return y; - x = x*x; // x^32 - if (n & 32) { - if (lowest == 32) y = x; else y *= x; - } - if (n < 64) return y; - x = x*x; // x^64 - if (n & 64) { - if (lowest == 64) y = x; else y *= x; - } - if (n < 128) return y; - x = x*x; // x^128 - if (n & 128) { - if (lowest == 128) y = x; else y *= x; - } - return y; -} - -template <int n> -static inline Vec8f pow(Vec8f const & a, Const_int_t<n>) { - return pow_n<n>(a); -} - #endif -// function round: round to nearest integer (even). (result as double vector) -static inline Vec8f round(Vec8f const & a) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return round(x); }); - BARRIER; - return r; -} - -// function truncate: round towards zero. (result as double vector) -static inline Vec8f truncate(Vec8f const & a) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return trunc(x); }); - BARRIER; - return r; -} - -// function floor: round towards minus infinity. (result as double vector) -static inline Vec8f floor(Vec8f const & a) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return floor(x); }); - BARRIER; - return r; -} - -// function ceil: round towards plus infinity. (result as double vector) -static inline Vec8f ceil(Vec8f const & a) { - BARRIER; - Vec8f r; - std::transform(a._d,a._d+8,r._d,[](auto x){ return ceil(x); }); - BARRIER; - return r; -} - -#if 0 -// function round_to_int: round to nearest integer (even). (result as integer vector) -static inline Vec8i round_to_int(Vec8f const & a) { - // Note: assume MXCSR control register is set to rounding - return _mm256_cvtpd_epi32(a); -} - -// function truncate_to_int: round towards zero. (result as integer vector) -static inline Vec8i truncate_to_int(Vec8f const & a) { - return _mm256_cvttpd_epi32(a); -} -#endif - - -// Fused multiply and add functions - -// Multiply and add -static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { - BARRIER; - Vec8f r; - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = a._d[i] * b._d[i] + c._d[i]; - BARRIER; - return r; -} - - -// Multiply and subtract -static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) { - BARRIER; - Vec8f r; - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = a._d[i] * b._d[i] - c._d[i]; - BARRIER; - return r; -} - -// Multiply and inverse subtract -static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { - BARRIER; - Vec8f r; - for (size_t i = 0 ; i < 8 ; ++i) - r._d[i] = - a._d[i] * b._d[i] + c._d[i]; - BARRIER; - return r; -} - - -template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> -static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) { - BARRIER; - _vcl::Vec8f a_,b_; - BARRIER; - a_.load(a._d[0].data()); - BARRIER; - b_.load(b._d[0].data()); - BARRIER; - _vcl::Vec8f r_ = _vcl::blend8f<i0,i1,i2,i3,i4,i5,i6,i7>(a_,b_); - BARRIER; - Vec8f::F::blends(1); - BARRIER; - Vec8f r; - BARRIER; - r_.store(r._d[0].data()); - BARRIER; - return r; -} - - -#endif // ENABLE_COUNTER - #endif // DUNE_PDELAB_COMMON_VECTORCLASS_HH diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index bb9bf793b3adf64f3c29fa5883a97762d0438ee2..cc21196fa1679ad109510e3a65155b0394c60d3d 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -30,7 +30,8 @@ import cgen def _type_to_op_counter_type(name): - return "oc::OpCounter<{}>".format(name) + include_file("dune/opcounter/opcounter.hh") + return "OpCounter::OpCounter<{}>".format(name) def dtype_floatingpoint(): @@ -63,6 +64,10 @@ def type_floatingpoint(): return numpy_to_cpp_dtype(NumpyType(dtype).dtype.name) +def type_context_floatingpoint(): + return {np.float32: 'f', np.float64: 'd'}.get(dtype_floatingpoint()) + + class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper): def map_subscript(self, expr, type_context): arr = self.find_array(expr) @@ -88,6 +93,10 @@ class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper): return expr def map_constant(self, expr, type_context): + # We correct the type context to force all floating point literals to be of + # the type that we use throughout the computation. + if type_context in ("f", "d"): + type_context = type_context_floatingpoint() ret = ExpressionToCExpressionMapper.map_constant(self, expr, type_context) if get_option('opcounter'): if type_context in ("f", "d"): diff --git a/python/dune/perftool/loopy/transformations/instrumentation.py b/python/dune/perftool/loopy/transformations/instrumentation.py index 89b08b6f0e191ca06db56d820c585ebe585e250b..2771ba141ccf5978067e3569038c4307bd01a357 100644 --- a/python/dune/perftool/loopy/transformations/instrumentation.py +++ b/python/dune/perftool/loopy/transformations/instrumentation.py @@ -22,7 +22,7 @@ def _union(a): return frozenset.union(*a) -def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False): +def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False, depends_on=frozenset()): """ Transform loopy kernel to contain instrumentation code Arguments: @@ -32,6 +32,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o identifier : The name of the counter to start and stop level : The instrumentation level this measurement is defined at filetag : The tag of the file that should contain the counter definitions + depends_on: Additional dependencies to add to the start instruction. This is used to correct + currently wrong behaviour of the transformation in cases where a lot of structure + of the instrumentation is known a priori. """ # If the instrumentation level is not high enough, this is a no-op if level > get_option("instrumentation_level"): @@ -53,6 +56,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o insn_inames = _intersect(tuple(i.within_inames for i in insns)) other_inames = _union(tuple(i.within_inames for i in lp.find_instructions(knl, lp.match.Not(match)))) within = _intersect((insn_inames, other_inames)) + uniontags = _intersect(tuple(i.tags for i in insns)) # Get a unique identifer - note that the same timer could be started and stopped several times # within one kernel... @@ -67,8 +71,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o "HP_TIMER_START({});".format(identifier), id=start_id, within_inames=within, - depends_on=start_depends, + depends_on=depends_on.union(start_depends), boostable_into=frozenset(), + tags=uniontags, ) # Add dependencies on the timing instructions @@ -82,6 +87,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o within_inames=within, depends_on=frozenset(i.id for i in insns), boostable_into=frozenset(), + tags=uniontags, ) # Find all the instructions that should depend on stop @@ -98,4 +104,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions)) # Add all the modified instructions into the kernel object - return knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn]) + knl = knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn]) + + from loopy.kernel.creation import resolve_dependencies + return resolve_dependencies(knl) diff --git a/python/dune/perftool/pdelab/driver/error.py b/python/dune/perftool/pdelab/driver/error.py index a4a7e2d414c7ffffce709c97d80c1b7a329dbef3..4934f06d66a7913041214130b51453df8cde979f 100644 --- a/python/dune/perftool/pdelab/driver/error.py +++ b/python/dune/perftool/pdelab/driver/error.py @@ -161,7 +161,7 @@ def accumulate_L2_squared(): @preamble(section="error") def define_accumulated_L2_error(name): t = type_range() - return "{} {}(0.0);".format(t, name) + return "Dune::FieldVector<{}, 1> {}(0.0);".format(t, name) def name_accumulated_L2_error(): @@ -182,7 +182,7 @@ def compare_L2_squared(): "if ({}.comm().rank() == 0){{".format(gv), " std::cout << \"\\nl2errorsquared: \" << {} << std::endl << std::endl;".format(accum_error), "}", - "if (isnan({0}) or abs({0})>{1})".format(accum_error, get_option("compare_l2errorsquared")), + "if (isnan({0}[0]) or abs({0}[0])>{1})".format(accum_error, get_option("compare_l2errorsquared")), " {} = true;".format(fail)] diff --git a/python/dune/perftool/pdelab/driver/timings.py b/python/dune/perftool/pdelab/driver/timings.py index 84f0ce839d7684d165faa6ab07b57dfe340d6b13..4ba2d8318de885030240adabcf3e9e139bb20524 100644 --- a/python/dune/perftool/pdelab/driver/timings.py +++ b/python/dune/perftool/pdelab/driver/timings.py @@ -8,6 +8,7 @@ from dune.perftool.generation import (cached, preamble, ) from dune.perftool.pdelab.driver import (get_form_ident, + is_linear, name_initree, name_mpihelper, ) @@ -131,18 +132,19 @@ def apply_jacobian_timer(): if get_option('instrumentation_level') >= 3: print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) - if get_option('instrumentation_level') >= 2: - evaluation = ["HP_TIMER_START(apply_jacobian);", - "{}.jacobian_apply({}, j);".format(n_go, v), - "HP_TIMER_STOP(apply_jacobian);", - "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)] - evaluation.extend(print_times) - else: + if is_linear(): + declaration = ["{} j({});".format(t_v, v), "j=0.0;"] evaluation = ["{}.jacobian_apply({}, j);".format(n_go, v)] + else: + declaration = ["{} j0({});".format(t_v, v), "j0=0.0;", + "{} j1({});".format(t_v, v), "j1=0.0;"] + evaluation = ["{}.nonlinear_jacobian_apply({}, j0, j1);".format(n_go, v)] - evaluation = ["{} j({});".format(t_v, v), "j=0.0;"] + evaluation + if get_option('instrumentation_level') >= 2: + evaluation = ["HP_TIMER_START(apply_jacobian);"] + evaluation + ["HP_TIMER_STOP(apply_jacobian);", "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)] + evaluation.extend(print_times) - return evaluation + return declaration + evaluation @preamble(section="timings") diff --git a/python/dune/perftool/pdelab/driver/visitor.py b/python/dune/perftool/pdelab/driver/visitor.py index 6549c9a18b8780a983abcfd0cc0be5f07d2e1e3f..da7c10ddb82273bef3b904a02295e3fe6e232eb5 100644 --- a/python/dune/perftool/pdelab/driver/visitor.py +++ b/python/dune/perftool/pdelab/driver/visitor.py @@ -4,11 +4,19 @@ from dune.perftool.ufl.visitor import UFL2LoopyVisitor import pymbolic.primitives as prim -@preamble +@preamble(section="init") def driver_using_statement(what): return "using {};".format(what) +@preamble(section="gridoperator") +def set_lop_to_starting_time(): + from dune.perftool.pdelab.driver import get_form_ident + from dune.perftool.pdelab.driver.gridoperator import name_localoperator + lop = name_localoperator(get_form_ident()) + return "{}.setTime(0.0);".format(lop) + + class DriverUFL2PymbolicVisitor(UFL2LoopyVisitor): def __init__(self): from dune.perftool.pdelab import PDELabInterface @@ -33,6 +41,7 @@ class DriverUFL2PymbolicVisitor(UFL2LoopyVisitor): from dune.perftool.pdelab.driver import get_form_ident from dune.perftool.pdelab.driver.gridoperator import name_localoperator lop = name_localoperator(get_form_ident()) + set_lop_to_starting_time() return prim.Call(prim.Variable("{}.getTime".format(lop)), ()) else: return UFL2LoopyVisitor.coefficient(self, o) diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index a4bd0201c10f681499b6dbd450b585306ca4ec94..727b7ae35fae4ef86fe1a6884204d4cbf3acfb2f 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -572,8 +572,8 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin if add_timings and get_form_option("sumfact"): from dune.perftool.pdelab.signatures import assembler_routine_name kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage1"), "{}_kernel_stage1".format(assembler_routine_name()), 4) - kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4) - kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4) + kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage1")})) + kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage2")})) if wrap_in_cgen: # Wrap the kernel in something which can generate code diff --git a/test/poisson/CMakeLists.txt b/test/poisson/CMakeLists.txt index a05cbe8c3650c9e33f8f5c4f1bbce696b2df9ba8..0a4f9897176e1aaa2a2864c23bf97135eb67261b 100644 --- a/test/poisson/CMakeLists.txt +++ b/test/poisson/CMakeLists.txt @@ -44,10 +44,12 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl ) # 8. Poisson with operator counting -dune_add_formcompiler_system_test(UFLFILE opcount_poisson_dg.ufl - BASENAME opcount_poisson_dg_symdiff - INIFILE opcount_poisson_dg_symdiff.mini - ) +if(dune-opcounter_FOUND) + dune_add_formcompiler_system_test(UFLFILE opcount_poisson_dg.ufl + BASENAME opcount_poisson_dg_symdiff + INIFILE opcount_poisson_dg_symdiff.mini + ) +endif() # 9. Poisson Test Case: DG quadrilaterals dune_add_formcompiler_system_test(UFLFILE poisson_dg_quadrilateral.ufl