diff --git a/dune/codegen/sumfact/horizontaladd.hh b/dune/codegen/sumfact/horizontaladd.hh index fc62dc47dab77330a125bb5f91805c89808480b0..7dd122af4c877b7206e99d71ba8aa05fc2fc448d 100644 --- a/dune/codegen/sumfact/horizontaladd.hh +++ b/dune/codegen/sumfact/horizontaladd.hh @@ -4,6 +4,50 @@ #include<dune/codegen/common/vectorclass.hh> +// Only use our custom implementations if we have AVX2 or later! +#if INSTRSET >= 8 + +/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd + * instruction and instead uses the shuffle port. + */ +static inline double permuting_horizontal_add (const Vec2d & a) +{ + return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a)); +} + +/** Implement a variant of horizontal_add(Vec4d) that avoids the haddpd + * instruction and instead uses the shuffle port. + */ +static inline double permuting_horizontal_add (const Vec4d& a) +{ + __m128d valupper = _mm256_extractf128_pd(a, 1); + __m128d vallower = _mm256_castpd256_pd128(a); + __m128d valval = _mm_add_pd(valupper, vallower); + __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval); + return _mm_cvtsd_f64(res); +} + +#if MAX_VECTOR_SIZE >= 512 + +/** Implement a variant of horizontal_add(Vec8d) that avoids the haddpd + * instruction and instead uses the shuffle port. + */ +static inline double permuting_horizontal_add(const Vec8d& a) +{ + return permuting_horizontal_add(a.get_low() + a.get_high()); +} + +#endif + +#else +template<typename V> +static inline double permuting_horizontal_add (const V& a) +{ + return horizontal_add(a); +} + +#endif + template<class V> typename base_floatingpoint<V>::value horizontal_add_lower(const V& x) { @@ -16,4 +60,16 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x) return horizontal_add(x.get_high()); } +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x) +{ + return permuting_horizontal_add(x.get_low()); +} + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x) +{ + return permuting_horizontal_add(x.get_high()); +} + #endif diff --git a/dune/codegen/sumfact/oc_horizontaladd.hh b/dune/codegen/sumfact/oc_horizontaladd.hh new file mode 100644 index 0000000000000000000000000000000000000000..d136aaf5ffc226e8e2463c5c40b3d64c8a476fe7 --- /dev/null +++ b/dune/codegen/sumfact/oc_horizontaladd.hh @@ -0,0 +1,25 @@ +#ifndef DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH +#define DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH + +#include<immintrin.h> +#include<dune/codegen/common/simdtraits.hh> + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x) +{ + return horizontal_add(x.get_low()); +} + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x) +{ + return horizontal_add(x.get_high()); +} + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x) +{ + return horizontal_add(x); +} + +#endif diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh index 7d1d45112d392a13a3dec4881ba43222cf60054b..2d0cdc6f543f2cb43d3a28bf55f562db0e64f13a 100755 --- a/patches/apply_patches.sh +++ b/patches/apply_patches.sh @@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch popd -pushd dune/codegen/vectorclass -git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch -popd - pushd python/ufl git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch popd diff --git a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch b/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch deleted file mode 100644 index c5ca6dc30e2135ab30a28c7373b94da344b8a7ac..0000000000000000000000000000000000000000 --- a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch +++ /dev/null @@ -1,44 +0,0 @@ -From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001 -From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> -Date: Thu, 30 Nov 2017 18:51:49 +0100 -Subject: [PATCH] Alternative implementation of horizontal_add on AVX512 - ---- - vectorf512.h | 19 +++++++++++++------ - 1 file changed, 13 insertions(+), 6 deletions(-) - -diff --git a/vectorf512.h b/vectorf512.h -index 0845d12..6a15ac2 100644 ---- a/vectorf512.h -+++ b/vectorf512.h -@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b) - - - // General arithmetic functions, etc. -+#if __GNUC__ < 7 -+extern __inline double -+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -+_mm512_cvtsd_f64 (__m512d __A) -+{ -+ return __A[0]; -+} -+#endif - - // Horizontal add: Calculates the sum of all vector elements. --static inline double horizontal_add (Vec8d const & a) { --#if defined(__INTEL_COMPILER) -- return _mm512_reduce_add_pd(a); --#else -- return horizontal_add(a.get_low() + a.get_high()); --#endif -+static inline double horizontal_add (Vec8d const & x) { -+ __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1))); -+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2))); -+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4))); -+ return _mm512_cvtsd_f64(intermediate); - } - - // function max: a > b ? a : b --- -2.1.4 - diff --git a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch deleted file mode 100644 index fee83d7ad7cedbacc588c530ad9581b49cfa3b54..0000000000000000000000000000000000000000 --- a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001 -From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> -Date: Mon, 19 Jun 2017 13:07:22 +0200 -Subject: [PATCH] Better implementation of horizontal_add - ---- - vectorf256.h | 9 +++++---- - 1 file changed, 5 insertions(+), 4 deletions(-) - -diff --git a/vectorf256.h b/vectorf256.h -index db509f8..2bbd9de 100644 ---- a/vectorf256.h -+++ b/vectorf256.h -@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) - - // Horizontal add: Calculates the sum of all vector elements. - static inline double horizontal_add (Vec4d const & a) { -- __m256d t1 = _mm256_hadd_pd(a,a); -- __m128d t2 = _mm256_extractf128_pd(t1,1); -- __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2); -- return _mm_cvtsd_f64(t3); -+ const __m128d valupper = _mm256_extractf128_pd(a, 1); -+ const __m128d vallower = _mm256_castpd256_pd128(a); -+ const __m128d valval = _mm_add_pd(valupper, vallower); -+ const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval); -+ return _mm_cvtsd_f64(res); - } - - // function max: a > b ? a : b --- -2.1.4 - diff --git a/python/dune/codegen/loopy/vcl.py b/python/dune/codegen/loopy/vcl.py index e0943a69ac35136abfe59a7f641e0f076681643c..2431275a5f18a3bc87272711ab5ed71c038ced0b 100644 --- a/python/dune/codegen/loopy/vcl.py +++ b/python/dune/codegen/loopy/vcl.py @@ -119,9 +119,17 @@ def vcl_function_mangler(knl, func, arg_dtypes): return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl)) if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"): + if get_option("permuting_horizontal_add"): + func = "permuting_{}".format(func) + dtype = arg_dtypes[0] vcl = lp.types.NumpyType(get_vcl_type(dtype)) - include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile") + + if get_option("opcounter"): + include_file("dune/codegen/sumfact/oc_horizontaladd.hh", filetag="operatorfile") + else: + include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile") + return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,)) if isinstance(func, VCLPermute): diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py index 51722da3fe34a3fa24b85f4d47e06750c0cc0007..9407709a515ef497b9f9b7561f570cf0411dd4ed 100644 --- a/python/dune/codegen/options.py +++ b/python/dune/codegen/options.py @@ -61,6 +61,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord): use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.") autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).") with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi") + permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.") # Arguments that are mainly to be set by logic depending on other options max_vector_width = CodegenOption(default=256, helpstr=None)