diff --git a/dune/codegen/sumfact/horizontaladd.hh b/dune/codegen/sumfact/horizontaladd.hh index 6b350a56e25c51dcfacee7fb177efcaab4fc9395..90dbf03832056964eea8107f6ba2c80868af3dee 100644 --- a/dune/codegen/sumfact/horizontaladd.hh +++ b/dune/codegen/sumfact/horizontaladd.hh @@ -17,18 +17,14 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x) return horizontal_add(x.get_high()); } -#if MAX_VECTOR_SIZE >= 512 - -/** Implement a variant of horizontal_add(Vec8d) that avoids the haddpd +/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd * instruction and instead uses the shuffle port. */ -static inline double permuting_horizontal_add(const Vec8d& a) +static inline double permuting_horizontal_add (const Vec2d & a) { - return permuting_horizontal_add(a.get_low()) + permuting_horizontal_add(a.get_high()); + return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a)); } -#endif - /** Implement a variant of horizontal_add(Vec4d) that avoids the haddpd * instruction and instead uses the shuffle port. */ @@ -41,14 +37,18 @@ static inline double permuting_horizontal_add (const Vec4d& a) return _mm_cvtsd_f64(res); } -/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd +#if MAX_VECTOR_SIZE >= 512 + +/** Implement a variant of horizontal_add(Vec8d) that avoids the haddpd * instruction and instead uses the shuffle port. */ -static inline double permuting_horizontal_add (const Vec2d & a) +static inline double permuting_horizontal_add(const Vec8d& a) { - return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a)); + return permuting_horizontal_add(a.get_low() + a.get_high()); } +#endif + template<class V> typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x) {