From 406d25027448e13656f671bb50a63ae6d5aa11db Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@uni-muenster.de>
Date: Fri, 15 Feb 2019 10:46:32 +0100
Subject: [PATCH] Handle tail correctly when tail_size % (vcl_size // 2) != 0

---
 .../codegen/blockstructured/vectorization.py  | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index c03ebc1f..5accb340 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -423,14 +423,14 @@ def add_vcl_iname_array(knl, iname, vec_iname, vcl_size):
     return knl
 
 
-def realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size):
+def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_size):
     tail_size = get_form_option('number_of_blocks') % vcl_size
     new_dom = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(tail_iname, tail_size))
 
     insns_to_duplicate = lp.find_instructions(knl, Iname(inner_iname))
     ids_to_duplicate = tuple((insn.id for insn in insns_to_duplicate))
 
-    subst_map = dict([(outer_iname, get_form_option('number_of_blocks') // vcl_size),
+    subst_map = dict([(outer_iname, outer_bound // vcl_size),
                       (inner_iname, prim.Variable(tail_iname))])
 
     temporaries_to_duplicate = dict()
@@ -478,18 +478,17 @@ def vectorize_micro_elements(knl):
         vcl_size = get_vcl_type_size(np.float64)
         knl = add_iname_array(knl, vec_iname)
 
-        def _do_vectorization(knl, vec_iname, vcl_size, level=0):
+        def _do_vectorization(knl, vec_iname, iname_bound, vcl_size, level=0):
             inner_iname = vec_iname + '_inner'
             outer_iname = vec_iname + '_outer'
 
-            has_tail = get_form_option('number_of_blocks') % vcl_size > 0
-            vectorize_tail = (vcl_size // 2) > 1 and \
-                (get_form_option('number_of_blocks') % vcl_size) % (vcl_size // 2) == 0
+            tail_size = iname_bound % vcl_size
+            vectorize_tail = ((vcl_size // 2) > 1) and (tail_size >= (vcl_size // 2))
 
             # manually add tail, since split_iname with slabs tries to vectorize the tail
-            if has_tail:
+            if tail_size > 0:
                 # fake suitable loop bound
-                vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size
+                vectorizable_bound = (iname_bound // vcl_size) * vcl_size
                 from loopy.kernel.tools import DomainChanger
                 domch = DomainChanger(knl, (vec_iname,))
                 knl = knl.copy(domains=domch.get_domains_with(
@@ -498,7 +497,7 @@ def vectorize_micro_elements(knl):
                 knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname)
 
                 tail_iname = vec_iname + '_inner' + '_tail'
-                knl = realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size)
+                knl = realize_tail(knl, inner_iname, outer_iname, iname_bound, tail_iname, vcl_size)
             else:
                 knl = lp.split_iname(knl, vec_iname, vcl_size)
 
@@ -512,12 +511,11 @@ def vectorize_micro_elements(knl):
             knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size)
             knl = add_vcl_access(knl, inner_iname, vcl_size, level)
 
-            if has_tail and vectorize_tail:
-                from pudb import set_trace; set_trace()
-                knl = _do_vectorization(knl, tail_iname, vcl_size // 2, level + 1)
+            if tail_size > 0 and vectorize_tail:
+                knl = _do_vectorization(knl, tail_iname, tail_size, vcl_size // 2, level + 1)
 
             return knl
 
-        knl = _do_vectorization(knl, orig_iname, vcl_size)
+        knl = _do_vectorization(knl, orig_iname, get_form_option('number_of_blocks'), vcl_size)
 
     return knl
-- 
GitLab