diff --git a/dune/codegen/sumfact/horizontaladd.hh b/dune/codegen/sumfact/horizontaladd.hh
index fc62dc47dab77330a125bb5f91805c89808480b0..7dd122af4c877b7206e99d71ba8aa05fc2fc448d 100644
--- a/dune/codegen/sumfact/horizontaladd.hh
+++ b/dune/codegen/sumfact/horizontaladd.hh
@@ -4,6 +4,50 @@
 #include<dune/codegen/common/vectorclass.hh>
 
 
+// Only use our custom implementations if we have AVX2 or later!
+#if INSTRSET >= 8
+
+/** Sum the two lanes of a Vec2d without the haddpd instruction: swap the
+ *  lanes with a permute, add, and return the low lane of the result.
+ */
+static inline double permuting_horizontal_add (const Vec2d & a)
+{
+    return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a));
+}
+
+/** Sum the four lanes of a Vec4d without a horizontal-add instruction: fold
+ *  the upper 128 bits onto the lower 128 bits, then add the swapped pair.
+ */
+static inline double permuting_horizontal_add (const Vec4d& a)
+{
+    const __m128d hi = _mm256_extractf128_pd(a, 1);
+    const __m128d lo = _mm256_castpd256_pd128(a);
+    const __m128d folded = _mm_add_pd(hi, lo);
+    const __m128d swapped = _mm_permute_pd(folded, 1);
+    return _mm_cvtsd_f64(_mm_add_pd(swapped, folded));
+}
+
+#if MAX_VECTOR_SIZE >= 512
+
+/** Sum all lanes of a Vec8d: add its low and high halves element-wise and
+ *  reduce that sum by calling permuting_horizontal_add again on it.
+ */
+static inline double permuting_horizontal_add(const Vec8d& a)
+{
+  return permuting_horizontal_add(a.get_low() + a.get_high());
+}
+
+#endif
+
+#endif // INSTRSET >= 8
+
+// Generic fallback, always available: any V without an overload above uses horizontal_add.
+template<typename V>
+static inline double permuting_horizontal_add (const V& a)
+{
+    return horizontal_add(a);
+}
+
 template<class V>
 typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
 {
@@ -16,4 +60,16 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
   return horizontal_add(x.get_high());
 }
 
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
+{
+  return permuting_horizontal_add(x.get_low());  // reduce only the lower half of x
+}
+
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
+{
+  return permuting_horizontal_add(x.get_high());  // reduce only the upper half of x
+}
+
 #endif
diff --git a/dune/codegen/sumfact/oc_horizontaladd.hh b/dune/codegen/sumfact/oc_horizontaladd.hh
new file mode 100644
index 0000000000000000000000000000000000000000..d136aaf5ffc226e8e2463c5c40b3d64c8a476fe7
--- /dev/null
+++ b/dune/codegen/sumfact/oc_horizontaladd.hh
@@ -0,0 +1,46 @@
+#ifndef DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
+#define DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
+
+#include<immintrin.h>
+#include<dune/codegen/common/simdtraits.hh>
+
+// Horizontal-add wrappers included by the code generator when the opcounter
+// option is set. All of them forward to horizontal_add(); both the plain and
+// the permuting_ names are provided so that either naming scheme resolves.
+
+/** Sum the elements of the lower half of x. */
+template<class V>
+typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
+{
+  return horizontal_add(x.get_low());
+}
+
+/** Sum the elements of the upper half of x. */
+template<class V>
+typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
+{
+  return horizontal_add(x.get_high());
+}
+
+/** Permuting-named variant of horizontal_add_lower(); forwards to horizontal_add(). */
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
+{
+  return horizontal_add(x.get_low());
+}
+
+/** Permuting-named variant of horizontal_add_upper(); forwards to horizontal_add(). */
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
+{
+  return horizontal_add(x.get_high());
+}
+
+/** Permuting-named variant of horizontal_add(); forwards to it unchanged. */
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x)
+{
+  return horizontal_add(x);
+}
+
+#endif
diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh
index 7d1d45112d392a13a3dec4881ba43222cf60054b..2d0cdc6f543f2cb43d3a28bf55f562db0e64f13a 100755
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch
 git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
 popd
 
-pushd dune/codegen/vectorclass
-git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
-popd
-
 pushd python/ufl
 git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch
 popd
diff --git a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch b/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
deleted file mode 100644
index c5ca6dc30e2135ab30a28c7373b94da344b8a7ac..0000000000000000000000000000000000000000
--- a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Thu, 30 Nov 2017 18:51:49 +0100
-Subject: [PATCH] Alternative implementation of horizontal_add on AVX512
-
----
- vectorf512.h | 19 +++++++++++++------
- 1 file changed, 13 insertions(+), 6 deletions(-)
-
-diff --git a/vectorf512.h b/vectorf512.h
-index 0845d12..6a15ac2 100644
---- a/vectorf512.h
-+++ b/vectorf512.h
-@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b)
- 
- 
- // General arithmetic functions, etc.
-+#if __GNUC__ < 7
-+extern __inline double
-+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-+_mm512_cvtsd_f64 (__m512d __A)
-+{
-+  return __A[0];
-+}
-+#endif
- 
- // Horizontal add: Calculates the sum of all vector elements.
--static inline double horizontal_add (Vec8d const & a) {
--#if defined(__INTEL_COMPILER)
--    return _mm512_reduce_add_pd(a);
--#else
--    return horizontal_add(a.get_low() + a.get_high());
--#endif
-+static inline double horizontal_add (Vec8d const & x) {
-+    __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4)));
-+    return _mm512_cvtsd_f64(intermediate);
- }
- 
- // function max: a > b ? a : b
--- 
-2.1.4
-
diff --git a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
deleted file mode 100644
index fee83d7ad7cedbacc588c530ad9581b49cfa3b54..0000000000000000000000000000000000000000
--- a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Mon, 19 Jun 2017 13:07:22 +0200
-Subject: [PATCH] Better implementation of horizontal_add
-
----
- vectorf256.h | 9 +++++----
- 1 file changed, 5 insertions(+), 4 deletions(-)
-
-diff --git a/vectorf256.h b/vectorf256.h
-index db509f8..2bbd9de 100644
---- a/vectorf256.h
-+++ b/vectorf256.h
-@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b)
- 
- // Horizontal add: Calculates the sum of all vector elements.
- static inline double horizontal_add (Vec4d const & a) {
--    __m256d t1 = _mm256_hadd_pd(a,a);
--    __m128d t2 = _mm256_extractf128_pd(t1,1);
--    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
--    return _mm_cvtsd_f64(t3);        
-+    const __m128d valupper = _mm256_extractf128_pd(a, 1);
-+    const __m128d vallower = _mm256_castpd256_pd128(a);
-+    const __m128d valval = _mm_add_pd(valupper, vallower);
-+    const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
-+    return _mm_cvtsd_f64(res);
- }
- 
- // function max: a > b ? a : b
--- 
-2.1.4
-
diff --git a/python/dune/codegen/loopy/vcl.py b/python/dune/codegen/loopy/vcl.py
index e0943a69ac35136abfe59a7f641e0f076681643c..2431275a5f18a3bc87272711ab5ed71c038ced0b 100644
--- a/python/dune/codegen/loopy/vcl.py
+++ b/python/dune/codegen/loopy/vcl.py
@@ -119,9 +119,17 @@ def vcl_function_mangler(knl, func, arg_dtypes):
         return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))
 
     if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"):
+        if get_option("permuting_horizontal_add"):
+            func = "permuting_{}".format(func)
+
         dtype = arg_dtypes[0]
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
-        include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
+
+        if get_option("opcounter"):
+            include_file("dune/codegen/sumfact/oc_horizontaladd.hh", filetag="operatorfile")
+        else:
+            include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
+
         return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,))
 
     if isinstance(func, VCLPermute):
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 51722da3fe34a3fa24b85f4d47e06750c0cc0007..9407709a515ef497b9f9b7561f570cf0411dd4ed 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -61,6 +61,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.")
     autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
     with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi")
+    permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = CodegenOption(default=256, helpstr=None)