From f564f0cbe80c8d86c4c102b74bf05724562bc5cf Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@uni-muenster.de>
Date: Thu, 14 Feb 2019 10:55:11 +0100
Subject: [PATCH] Make the vectorization tail work as an unvectorized loop

---
 .../codegen/blockstructured/vectorization.py  | 74 ++++++++++++-------
 1 file changed, 46 insertions(+), 28 deletions(-)

diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index e8c56e0e..9afe9baf 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -3,6 +3,7 @@ import numpy as np
 import pymbolic.primitives as prim
 
 from loopy.match import Tagged, Id, Writes, Or, Iname
+from islpy import BasicSet
 
 from dune.codegen.generation import get_global_context_value
 from dune.codegen.loopy.target import dtype_floatingpoint
@@ -377,46 +378,52 @@ def add_iname_array(knl, vec_iname):
 
 def realize_tail(knl, iname_inner, iname_outer, vcl_size):
     tail_size = get_form_option('number_of_blocks') % vcl_size
+    new_dom = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(iname_inner + '_tail', tail_size))
 
     insns_to_duplicate = lp.find_instructions(knl, Iname(iname_inner))
     ids_to_duplicate = tuple((insn.id for insn in insns_to_duplicate))
 
-    common_inames = knl.all_inames()
-    for insn in insns_to_duplicate:
-        common_inames = common_inames & insn.within_inames
+    subst_map = dict([(iname_outer, get_form_option('number_of_blocks') // vcl_size),
+                      (iname_inner, prim.Variable(iname_inner + '_tail'))])
 
-    additional_inames_to_duplicate = frozenset()
+    temporaries_to_duplicate = dict()
     for insn in insns_to_duplicate:
-        additional_inames_to_duplicate = additional_inames_to_duplicate | (insn.within_inames - common_inames)
+        if isinstance(insn, lp.Assignment):
+            assignee = insn.assignee
+            name = get_pymbolic_basename(assignee)
+            if name in knl.temporary_variables:
+                new_name = name + '_tail'
+                temporaries_to_duplicate[new_name] = knl.temporary_variables[name].copy(name=new_name)
+                subst_map[name] = prim.Variable(new_name)
 
-    inames_to_duplicate = frozenset({iname_inner}) | additional_inames_to_duplicate
-
-    combined_domain_str = str(knl.get_inames_domain(additional_inames_to_duplicate))
-    for iname in additional_inames_to_duplicate:
-        combined_domain_str = combined_domain_str.replace(iname, iname + '_tail')
+    new_insns = []
+    for insn in insns_to_duplicate:
+        new_insn = insn.with_transformed_expressions(lambda e: substitute(e, subst_map))
+        new_depends_on = frozenset((insn_id + '_tail' if insn_id in ids_to_duplicate else insn_id
+                                    for insn_id in insn.depends_on))
+        new_within_inames = frozenset((iname + '_tail' if iname == iname_inner else iname
+                                       for iname in insn.within_inames)) - frozenset({iname_outer})
+        new_insns.append(new_insn.copy(id=insn.id + '_tail', depends_on=new_depends_on,
+                                   within_inames=new_within_inames))
 
-    from islpy import BasicSet
-    new_doms = [BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(iname_inner + '_tail', tail_size)),
-                BasicSet(combined_domain_str)]
+    knl = knl.copy(domains=knl.domains + [new_dom], instructions=knl.instructions + new_insns,
+                   temporary_variables=dict(**knl.temporary_variables, **temporaries_to_duplicate))
 
-    subst_map = dict([(iname, prim.Variable(iname + '_tail')) for iname in inames_to_duplicate] +
-                     [(iname_outer, get_form_option('number_of_blocks') // vcl_size)])
+    common_inames = knl.all_inames()
+    for insn in insns_to_duplicate:
+        common_inames = common_inames & (insn.within_inames | insn.reduction_inames())
 
-    new_insns = []
-    for insn in knl.instructions:
-        if iname_inner in insn.within_inames:
-            new_insn = insn.with_transformed_expressions(lambda e: substitute(e, subst_map))
-            new_depends_on = frozenset((insn_id + '_tail' if insn_id in ids_to_duplicate else insn_id
-                                        for insn_id in insn.depends_on))
-            new_within_inames = frozenset((iname + '_tail' if iname in inames_to_duplicate else iname
-                                           for iname in insn.within_inames))
-            new_insns.append(new_insn.copy(id=new_insn.id + '_tail', depends_on=new_depends_on,
-                                           within_inames=new_within_inames))
+    additional_inames_to_duplicate = frozenset()
+    for insn in insns_to_duplicate:
+        additional_inames_to_duplicate = additional_inames_to_duplicate | \
+                                         ((insn.within_inames | insn.reduction_inames()) - common_inames)
 
-    knl = knl.copy(domains=knl.domains + new_doms, instructions=knl.instructions + new_insns)
+    knl = lp.duplicate_inames(knl, tuple(additional_inames_to_duplicate),
+                              Or(tuple((Id(insn.id) for insn in new_insns))))
 
     return lp.make_reduction_inames_unique(knl)
 
+
 def vectorize_micro_elements(knl):
     vec_iname = "subel_x"
     if vec_iname in knl.all_inames() and get_global_context_value('integral_type') == 'cell':
@@ -424,8 +431,19 @@ def vectorize_micro_elements(knl):
 
         knl = add_iname_array(knl, vec_iname)
 
-        knl = lp.split_iname(knl, vec_iname, vcl_size, slabs=(0,1))
-        knl = realize_tail(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
+        # manually add tail, since split_iname with slabs tries to vectorize the tail
+        if get_form_option('number_of_blocks') % vcl_size > 0:
+            vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size
+            from loopy.kernel.tools import DomainChanger
+            domch = DomainChanger(knl, (vec_iname,))
+            knl = knl.copy(domains=domch.get_domains_with(
+                BasicSet('{{ [{0}]: 0<={0}<{1} }}'.format(vec_iname, vectorizable_bound))))
+
+            knl = lp.split_iname(knl, vec_iname, vcl_size)
+            knl = realize_tail(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
+        else:
+            knl = lp.split_iname(knl, vec_iname, vcl_size)
+
         knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')])
 
         array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
-- 
GitLab