Skip to content
Snippets Groups Projects
Commit af86afdb authored by Dominic Kempf's avatar Dominic Kempf
Browse files

[!309] Move patches for VCL into dune-codegen under a different name and introduce a switch

Merge branch 'feature/permuting-horizontal_adds' into 'master'

ref:extensions/dune-codegen The number of alternative implementations will
grow in the near future, so it's much better to have them in dune-codegen than
patching the VCL\...

See merge request [extensions/dune-codegen!309]

  [extensions/dune-codegen!309]: gitlab.dune-project.org/extensions/dune-codegen/merge_requests/309
parents 6c4d06b9 3b0aad3a
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,50 @@
#include<dune/codegen/common/vectorclass.hh>
// Only use our custom implementations if we have AVX2 or later!
#if INSTRSET >= 8
/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd
* instruction and instead uses the shuffle port.
*/
static inline double permuting_horizontal_add (const Vec2d & a)
{
return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a));
}
/** Horizontal sum of a Vec4d without the horizontal-add instruction.
 *
 * The 256-bit vector is folded in two steps: first the upper 128-bit half is
 * added onto the lower half, then the remaining two lanes are combined by
 * swapping them with a permute and adding once more. Lane 0 of the final
 * vector is returned.
 *
 * @param a The four-lane double vector to reduce.
 * @return The sum of all four lanes of a.
 */
static inline double permuting_horizontal_add (const Vec4d& a)
{
  // Step 1: fold 4 lanes down to 2 (upper half + lower half).
  const __m128d hi = _mm256_extractf128_pd(a, 1);
  const __m128d lo = _mm256_castpd256_pd128(a);
  const __m128d pairs = _mm_add_pd(hi, lo);

  // Step 2: fold 2 lanes down to 1 by adding the lane-swapped copy.
  const __m128d swapped = _mm_permute_pd(pairs, 1);
  const __m128d total = _mm_add_pd(swapped, pairs);
  return _mm_cvtsd_f64(total);
}
#if MAX_VECTOR_SIZE >= 512
/** Implement a variant of horizontal_add(Vec8d) that avoids the haddpd
* instruction and instead uses the shuffle port.
*/
static inline double permuting_horizontal_add(const Vec8d& a)
{
return permuting_horizontal_add(a.get_low() + a.get_high());
}
#endif
#else
/** Fallback used when the instruction set check above fails (INSTRSET < 8).
 *
 * No custom permuting implementation is provided in this branch, so the call
 * is simply forwarded to horizontal_add.
 *
 * @tparam V The SIMD vector type.
 * @param a The vector to reduce.
 * @return The result of horizontal_add(a), converted to double.
 */
template<typename V>
static inline double permuting_horizontal_add (const V& a)
{
  const double total = horizontal_add(a);
  return total;
}
#endif
template<class V>
typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
{
......@@ -16,4 +60,16 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
return horizontal_add(x.get_high());
}
/** Reduce only the lower half of a SIMD vector with permuting_horizontal_add.
 *
 * @tparam V The SIMD vector type; must provide get_low().
 * @param x The vector whose lower half is summed.
 * @return permuting_horizontal_add applied to x.get_low().
 */
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
{
  typedef typename base_floatingpoint<V>::value scalar_type;
  const scalar_type lower_sum = permuting_horizontal_add(x.get_low());
  return lower_sum;
}
/** Reduce only the upper half of a SIMD vector with permuting_horizontal_add.
 *
 * @tparam V The SIMD vector type; must provide get_high().
 * @param x The vector whose upper half is summed.
 * @return permuting_horizontal_add applied to x.get_high().
 */
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
{
  typedef typename base_floatingpoint<V>::value scalar_type;
  const scalar_type upper_sum = permuting_horizontal_add(x.get_high());
  return upper_sum;
}
#endif
#ifndef DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
#define DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
#include<immintrin.h>
#include<dune/codegen/common/simdtraits.hh>
/** Variant of permuting_horizontal_add_lower for this header.
 *
 * Here the permuting name forwards to the plain horizontal_add on the lower
 * half of the vector; no permuting implementation is involved.
 *
 * @tparam V The SIMD vector type; must provide get_low().
 * @param x The vector whose lower half is summed.
 * @return horizontal_add applied to x.get_low().
 */
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
{
  typedef typename base_floatingpoint<V>::value scalar_type;
  const scalar_type lower_sum = horizontal_add(x.get_low());
  return lower_sum;
}
/** Variant of permuting_horizontal_add_upper for this header.
 *
 * Here the permuting name forwards to the plain horizontal_add on the upper
 * half of the vector; no permuting implementation is involved.
 *
 * @tparam V The SIMD vector type; must provide get_high().
 * @param x The vector whose upper half is summed.
 * @return horizontal_add applied to x.get_high().
 */
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
{
  typedef typename base_floatingpoint<V>::value scalar_type;
  const scalar_type upper_sum = horizontal_add(x.get_high());
  return upper_sum;
}
/** Variant of permuting_horizontal_add for this header.
 *
 * Here the permuting name forwards to the plain horizontal_add on the whole
 * vector; no permuting implementation is involved.
 *
 * @tparam V The SIMD vector type.
 * @param x The vector to reduce.
 * @return horizontal_add applied to x.
 */
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x)
{
  typedef typename base_floatingpoint<V>::value scalar_type;
  const scalar_type total = horizontal_add(x);
  return total;
}
#endif
......@@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch
git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
popd
pushd dune/codegen/vectorclass
git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
popd
pushd python/ufl
git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch
popd
From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Thu, 30 Nov 2017 18:51:49 +0100
Subject: [PATCH] Alternative implementation of horizontal_add on AVX512
---
vectorf512.h | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/vectorf512.h b/vectorf512.h
index 0845d12..6a15ac2 100644
--- a/vectorf512.h
+++ b/vectorf512.h
@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b)
// General arithmetic functions, etc.
+#if __GNUC__ < 7
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsd_f64 (__m512d __A)
+{
+ return __A[0];
+}
+#endif
// Horizontal add: Calculates the sum of all vector elements.
-static inline double horizontal_add (Vec8d const & a) {
-#if defined(__INTEL_COMPILER)
- return _mm512_reduce_add_pd(a);
-#else
- return horizontal_add(a.get_low() + a.get_high());
-#endif
+static inline double horizontal_add (Vec8d const & x) {
+ __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)));
+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2)));
+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4)));
+ return _mm512_cvtsd_f64(intermediate);
}
// function max: a > b ? a : b
--
2.1.4
From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 19 Jun 2017 13:07:22 +0200
Subject: [PATCH] Better implementation of horizontal_add
---
vectorf256.h | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/vectorf256.h b/vectorf256.h
index db509f8..2bbd9de 100644
--- a/vectorf256.h
+++ b/vectorf256.h
@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b)
// Horizontal add: Calculates the sum of all vector elements.
static inline double horizontal_add (Vec4d const & a) {
- __m256d t1 = _mm256_hadd_pd(a,a);
- __m128d t2 = _mm256_extractf128_pd(t1,1);
- __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
- return _mm_cvtsd_f64(t3);
+ const __m128d valupper = _mm256_extractf128_pd(a, 1);
+ const __m128d vallower = _mm256_castpd256_pd128(a);
+ const __m128d valval = _mm_add_pd(valupper, vallower);
+ const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
+ return _mm_cvtsd_f64(res);
}
// function max: a > b ? a : b
--
2.1.4
......@@ -119,9 +119,17 @@ def vcl_function_mangler(knl, func, arg_dtypes):
return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))
if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"):
if get_option("permuting_horizontal_add"):
func = "permuting_{}".format(func)
dtype = arg_dtypes[0]
vcl = lp.types.NumpyType(get_vcl_type(dtype))
include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
if get_option("opcounter"):
include_file("dune/codegen/sumfact/oc_horizontaladd.hh", filetag="operatorfile")
else:
include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,))
if isinstance(func, VCLPermute):
......
......@@ -61,6 +61,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.")
autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi")
permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.")
# Arguments that are mainly to be set by logic depending on other options
max_vector_width = CodegenOption(default=256, helpstr=None)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment