[!309] Move patches for VCL into dune-codegen under a different name and introduce a switch

Merge branch 'feature/permuting-horizontal_adds' into 'master' ref:extensions/dune-codegen The number of alternative implementations will grow in the near future, so it's much better to have them in dune-codegen than patching the VCL... See merge request [extensions/dune-codegen!309] [extensions/dune-codegen!309]: gitlab.dune-project.org/extensions/dune-codegen/merge_requests/309

[!309] Move patches for VCL into dune-codegen under a different name and introduce a switch
Merge branch 'feature/permuting-horizontal_adds' into 'master' ref:extensions/dune-codegen The number of alternative implementations will grow in the near future, so it's much better to have them in dune-codegen than patching the VCL... See merge request [extensions/dune-codegen!309] [extensions/dune-codegen!309]: gitlab.dune-project.org/extensions/dune-codegen/merge_requests/309
af86afdb · Dominic Kempf · 6c4d06b9 · 3b0aad3a · af86afdb · af86afdb
Commit af86afdb authored 5 years ago by Dominic Kempf
--- a/dune/codegen/sumfact/horizontaladd.hh
+++ b/dune/codegen/sumfact/horizontaladd.hh
@@ -4,6 +4,50 @@
 #include<dune/codegen/common/vectorclass.hh>


+// Only use our custom implementations if we have AVX2 or later!
+#if INSTRSET >= 8
+
+/** Variant of horizontal_add(Vec2d) that avoids the haddpd instruction and
+ *  uses the shuffle port instead: a permute swaps the two lanes, the swapped
+ *  vector is added to the input, and lane 0 of the sum is returned. */
+static inline double permuting_horizontal_add (const Vec2d & a)
+{
+    return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a));  // lane 0 = a[0] + a[1]
+}
+
+/** Variant of horizontal_add(Vec4d) that avoids the haddpd instruction and
+ *  uses the shuffle port instead: the upper and lower 128-bit halves are
+ *  added, then the two remaining lanes are summed via a lane-swapping permute. */
+static inline double permuting_horizontal_add (const Vec4d& a)
+{
+    __m128d valupper = _mm256_extractf128_pd(a, 1);  // upper 128-bit half: (a[2], a[3])
+    __m128d vallower = _mm256_castpd256_pd128(a);  // lower 128-bit half: (a[0], a[1])
+    __m128d valval = _mm_add_pd(valupper, vallower);  // (a[0]+a[2], a[1]+a[3])
+    __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);  // swapped + original: both lanes hold the total
+    return _mm_cvtsd_f64(res);
+}
+
+#if MAX_VECTOR_SIZE >= 512
+
+/** Variant of horizontal_add(Vec8d): adds the halves returned by get_low()
+ *  and get_high() and reduces that sum with the permuting_horizontal_add
+ *  overload matching its type (presumably the Vec4d one - TODO confirm). */
+static inline double permuting_horizontal_add(const Vec8d& a)
+{
+  return permuting_horizontal_add(a.get_low() + a.get_high());
+}
+
+#endif
+
+#else
+template<typename V>  // fallback branch taken when INSTRSET < 8
+static inline double permuting_horizontal_add (const V& a)
+{
+    return horizontal_add(a);  // no custom variant here: forward to horizontal_add (declared outside this diff)
+}
+
+#endif
+
 template<class V>
 typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
 {
@@ -16,4 +60,16 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
  return horizontal_add(x.get_high());
 }

+template<class V>  // V must provide get_low()
+typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
+{
+  return permuting_horizontal_add(x.get_low());  // sum only the part returned by get_low()
+}
+
+template<class V>  // V must provide get_high()
+typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
+{
+  return permuting_horizontal_add(x.get_high());  // sum only the part returned by get_high()
+}
+
 #endif
--- a/dune/codegen/sumfact/oc_horizontaladd.hh
+++ b/dune/codegen/sumfact/oc_horizontaladd.hh
+#ifndef DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
+#define DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
+
+#include<immintrin.h>
+#include<dune/codegen/common/simdtraits.hh>
+
+template<class V>  // permuting name, but forwards to the plain horizontal_add
+typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
+{
+  return horizontal_add(x.get_low());  // sum only the part returned by get_low()
+}  // NOTE(review): no non-permuting horizontal_add_lower/_upper is defined in this header - confirm callers never need them here
+
+template<class V>  // permuting name, but forwards to the plain horizontal_add
+typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
+{
+  return horizontal_add(x.get_high());  // sum only the part returned by get_high()
+}
+
+template<class V>  // permuting name, but forwards to the plain horizontal_add
+typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x)
+{
+  return horizontal_add(x);  // horizontal_add itself is declared outside this header
+}
+
+#endif
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch
 git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
 popd

-pushd dune/codegen/vectorclass
-git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
-popd
-
 pushd python/ufl
 git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch
 popd
--- a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
+++ b/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
-From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Thu, 30 Nov 2017 18:51:49 +0100
-Subject: [PATCH] Alternative implementation of horizontal_add on AVX512
-
---
- vectorf512.h | 19 +++++++++++++------
- 1 file changed, 13 insertions(+), 6 deletions(-)
-
-diff --git a/vectorf512.h b/vectorf512.h
-index 0845d12..6a15ac2 100644
--- a/vectorf512.h
-+++ b/vectorf512.h
-@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b)
- 
- 
- // General arithmetic functions, etc.
-+#if __GNUC__ < 7
-+extern __inline double
-+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-+_mm512_cvtsd_f64 (__m512d __A)
-+{
-+  return __A[0];
-+}
-+#endif
- 
- // Horizontal add: Calculates the sum of all vector elements.
-static inline double horizontal_add (Vec8d const & a) {
-#if defined(__INTEL_COMPILER)
-    return _mm512_reduce_add_pd(a);
-#else
-    return horizontal_add(a.get_low() + a.get_high());
-#endif
-+static inline double horizontal_add (Vec8d const & x) {
-+    __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4)));
-+    return _mm512_cvtsd_f64(intermediate);
- }
- 
- // function max: a > b ? a : b
-- 
-2.1.4
-
--- a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
+++ b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
-From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Mon, 19 Jun 2017 13:07:22 +0200
-Subject: [PATCH] Better implementation of horizontal_add
-
---
- vectorf256.h | 9 +++++----
- 1 file changed, 5 insertions(+), 4 deletions(-)
-
-diff --git a/vectorf256.h b/vectorf256.h
-index db509f8..2bbd9de 100644
--- a/vectorf256.h
-+++ b/vectorf256.h
-@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b)
- 
- // Horizontal add: Calculates the sum of all vector elements.
- static inline double horizontal_add (Vec4d const & a) {
-    __m256d t1 = _mm256_hadd_pd(a,a);
-    __m128d t2 = _mm256_extractf128_pd(t1,1);
-    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
-    return _mm_cvtsd_f64(t3);        
-+    const __m128d valupper = _mm256_extractf128_pd(a, 1);
-+    const __m128d vallower = _mm256_castpd256_pd128(a);
-+    const __m128d valval = _mm_add_pd(valupper, vallower);
-+    const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
-+    return _mm_cvtsd_f64(res);
- }
- 
- // function max: a > b ? a : b
-- 
-2.1.4
-
--- a/python/dune/codegen/loopy/vcl.py
+++ b/python/dune/codegen/loopy/vcl.py
@@ -119,9 +119,17 @@ def vcl_function_mangler(knl, func, arg_dtypes):
        return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))

    if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"):
+        if get_option("permuting_horizontal_add"):
+            func = "permuting_{}".format(func)
+
        dtype = arg_dtypes[0]
        vcl = lp.types.NumpyType(get_vcl_type(dtype))
-        include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
+
+        if get_option("opcounter"):
+            include_file("dune/codegen/sumfact/oc_horizontaladd.hh", filetag="operatorfile")
+        else:
+            include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
+
        return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,))

    if isinstance(func, VCLPermute):

--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -61,6 +61,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
    use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.")
    autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
    with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi")
+    permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.")

    # Arguments that are mainly to be set by logic depending on other options
    max_vector_width = CodegenOption(default=256, helpstr=None)