diff --git a/dune/codegen/sumfact/horizontaladd.hh b/dune/codegen/sumfact/horizontaladd.hh
index fc62dc47dab77330a125bb5f91805c89808480b0..7257a432ee72e176f27775d5ecf279ff7b450f78 100644
--- a/dune/codegen/sumfact/horizontaladd.hh
+++ b/dune/codegen/sumfact/horizontaladd.hh
@@ -16,4 +16,45 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
   return horizontal_add(x.get_high());
 }
 
+/** Implement a variant of horizontal_add(Vec4d) that avoids the haddpd
+ *  instruction and instead uses the shuffle port.
+ */
+static inline double permuting_horizontal_add (Vec4d const & a)
+{
+    const __m128d valupper = _mm256_extractf128_pd(a, 1);
+    const __m128d vallower = _mm256_castpd256_pd128(a);
+    const __m128d valval = _mm_add_pd(valupper, vallower);
+    const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
+    return _mm_cvtsd_f64(res);
+}
+
+/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd
+ *  instruction and instead uses the shuffle port.
+ */
+static inline double permuting_horizontal_add (Vec2d const & a)
+{
+    return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a));
+}
+
+/** Fallback implementation that delegates to the standard non-permuting
+ *  horizontal_add. Needed for the opcounting branch.
+ */
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x)
+{
+    return horizontal_add(x);
+}
+
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
+{
+  return permuting_horizontal_add(x.get_low());
+}
+
+template<class V>
+typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
+{
+  return permuting_horizontal_add(x.get_high());
+}
+
 #endif
diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh
index 7d1d45112d392a13a3dec4881ba43222cf60054b..2d0cdc6f543f2cb43d3a28bf55f562db0e64f13a 100755
--- a/patches/apply_patches.sh
+++ b/patches/apply_patches.sh
@@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch
 git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
 popd
 
-pushd dune/codegen/vectorclass
-git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
-popd
-
 pushd python/ufl
 git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch
 popd
diff --git a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch b/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
deleted file mode 100644
index c5ca6dc30e2135ab30a28c7373b94da344b8a7ac..0000000000000000000000000000000000000000
--- a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Thu, 30 Nov 2017 18:51:49 +0100
-Subject: [PATCH] Alternative implementation of horizontal_add on AVX512
-
----
- vectorf512.h | 19 +++++++++++++------
- 1 file changed, 13 insertions(+), 6 deletions(-)
-
-diff --git a/vectorf512.h b/vectorf512.h
-index 0845d12..6a15ac2 100644
---- a/vectorf512.h
-+++ b/vectorf512.h
-@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b)
- 
- 
- // General arithmetic functions, etc.
-+#if __GNUC__ < 7
-+extern __inline double
-+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-+_mm512_cvtsd_f64 (__m512d __A)
-+{
-+  return __A[0];
-+}
-+#endif
- 
- // Horizontal add: Calculates the sum of all vector elements.
--static inline double horizontal_add (Vec8d const & a) {
--#if defined(__INTEL_COMPILER)
--    return _mm512_reduce_add_pd(a);
--#else
--    return horizontal_add(a.get_low() + a.get_high());
--#endif
-+static inline double horizontal_add (Vec8d const & x) {
-+    __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2)));
-+    intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4)));
-+    return _mm512_cvtsd_f64(intermediate);
- }
- 
- // function max: a > b ? a : b
--- 
-2.1.4
-
diff --git a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
deleted file mode 100644
index fee83d7ad7cedbacc588c530ad9581b49cfa3b54..0000000000000000000000000000000000000000
--- a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001
-From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Mon, 19 Jun 2017 13:07:22 +0200
-Subject: [PATCH] Better implementation of horizontal_add
-
----
- vectorf256.h | 9 +++++----
- 1 file changed, 5 insertions(+), 4 deletions(-)
-
-diff --git a/vectorf256.h b/vectorf256.h
-index db509f8..2bbd9de 100644
---- a/vectorf256.h
-+++ b/vectorf256.h
-@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b)
- 
- // Horizontal add: Calculates the sum of all vector elements.
- static inline double horizontal_add (Vec4d const & a) {
--    __m256d t1 = _mm256_hadd_pd(a,a);
--    __m128d t2 = _mm256_extractf128_pd(t1,1);
--    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
--    return _mm_cvtsd_f64(t3);        
-+    const __m128d valupper = _mm256_extractf128_pd(a, 1);
-+    const __m128d vallower = _mm256_castpd256_pd128(a);
-+    const __m128d valval = _mm_add_pd(valupper, vallower);
-+    const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
-+    return _mm_cvtsd_f64(res);
- }
- 
- // function max: a > b ? a : b
--- 
-2.1.4
-
diff --git a/python/dune/codegen/loopy/vcl.py b/python/dune/codegen/loopy/vcl.py
index e0943a69ac35136abfe59a7f641e0f076681643c..191e6e1843f8e1bb98ee5b5307336e7dafdc32bc 100644
--- a/python/dune/codegen/loopy/vcl.py
+++ b/python/dune/codegen/loopy/vcl.py
@@ -119,6 +119,9 @@ def vcl_function_mangler(knl, func, arg_dtypes):
         return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl))
 
     if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"):
+        if get_option("permuting_horizontal_add"):
+            func = "permuting_{}".format(func)
+
         dtype = arg_dtypes[0]
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
         include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile")
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index bb6bbafaf09dd0b0ae7330b174dfa5805769d09f..a3111553a1f69aeffc3f8da5286fd201dd5ec0ec 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     target_name = CodegenOption(default=None, helpstr="The target name from CMake")
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
+    permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = CodegenOption(default=256, helpstr=None)