From 808f70848cb39ded755486a2482287cd1156c912 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 19 Jun 2017 13:08:04 +0200
Subject: [PATCH] Update vector class library + Patch

---
 dune/perftool/vectorclass                     |  2 +-
 ...ter-implementation-of-horizontal_add.patch | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch

diff --git a/dune/perftool/vectorclass b/dune/perftool/vectorclass
index 4e11d282..8d52f136 160000
--- a/dune/perftool/vectorclass
+++ b/dune/perftool/vectorclass
@@ -1 +1 @@
-Subproject commit 4e11d28201c90f357771c98af790eccfaea2103d
+Subproject commit 8d52f13665adbfe4b93bbf076b79828d03563ce1
diff --git a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
new file mode 100644
index 00000000..cb167554
--- /dev/null
+++ b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
@@ -0,0 +1,32 @@
+From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001
+From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
+Date: Mon, 19 Jun 2017 13:07:22 +0200
+Subject: [PATCH] Better implementation of horizontal_add
+
+---
+ vectorf256.h | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+diff --git a/vectorf256.h b/vectorf256.h
+index db509f8..2bbd9de 100644
+--- a/vectorf256.h
++++ b/vectorf256.h
+@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b)
+ 
+ // Horizontal add: Calculates the sum of all vector elements.
+ static inline double horizontal_add (Vec4d const & a) {
+-    __m256d t1 = _mm256_hadd_pd(a,a);
+-    __m128d t2 = _mm256_extractf128_pd(t1,1);
+-    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
+-    return _mm_cvtsd_f64(t3);
++    const __m128d valupper = _mm256_extractf128_pd(a, 1);              // upper 128-bit half of a
++    const __m128d vallower = _mm256_castpd256_pd128(a);                // lower 128-bit half of a
++    const __m128d valval = _mm_add_pd(valupper, vallower);             // pairwise sums of the two halves
++    const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);  // add the swapped pair: low element is the total
++    return _mm_cvtsd_f64(res);
+ }
+ 
+ // function max: a > b ? a : b
+-- 
+2.1.4
+
-- 
GitLab