From 57ad4513d47467f1335d80e2c69aae5d31c07b9f Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@uni-muenster.de>
Date: Fri, 15 Feb 2019 16:03:44 +0100
Subject: [PATCH] add option for tail ordering

'consecutive' means the tail is within the same subelement inames as
the vectorized loop (except for the vectorized iname);
'blocked' means each tail has its own set of subelement inames.
---
 .../dune/codegen/blockstructured/vectorization.py  | 14 ++++++++++----
 python/dune/codegen/options.py                     |  3 ++-
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index 22942d66..cfdfc794 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -1,6 +1,7 @@
 import loopy as lp
 import numpy as np
 import pymbolic.primitives as prim
+from dune.codegen.blockstructured.tools import sub_element_inames
 
 from loopy.match import Tagged, Id, Writes, Reads, And, Or, Iname, All, Not
 from islpy import BasicSet
@@ -468,13 +469,18 @@ def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_siz
                    temporary_variables=dict(**knl.temporary_variables, **temporaries_to_duplicate))
 
     common_inames = knl.all_inames()
-    for insn in insns_to_duplicate:
+    for insn in new_insns:
         common_inames = common_inames & (insn.within_inames | insn.reduction_inames())
 
+    if get_form_option('vectorization_blockstructured_tail_ordering') == 'blocked':
+        # TODO need to be more clever to get the right inames
+        macro_inames = frozenset((iname + '_0' * level) for iname in sub_element_inames())
+        common_inames = common_inames - macro_inames
+
     additional_inames_to_duplicate = frozenset()
-    for insn in insns_to_duplicate:
-        additional_inames_to_duplicate = additional_inames_to_duplicate | ((insn.within_inames |
-                                                                            insn.reduction_inames()) - common_inames)
+    for insn in new_insns:
+        insn_inames = insn.within_inames | insn.reduction_inames()
+        additional_inames_to_duplicate = additional_inames_to_duplicate | (insn_inames - common_inames)
 
     knl = lp.duplicate_inames(knl, tuple(additional_inames_to_duplicate),
                               Or(tuple((Id(insn.id) for insn in new_insns))))
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index bf80a72a..00fc2e48 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -101,7 +101,8 @@ class CodegenFormOptionsArray(ImmutableRecord):
     blockstructured = CodegenOption(default=False, helpstr="Use block structure")
     number_of_blocks = CodegenOption(default=1, helpstr="Number of sub blocks in one direction")
     vectorization_blockstructured = CodegenOption(default=False, helpstr="Vectorize block structuring")
-    vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'nunmber_of_blocks' is not divisible by vector length.")
+    vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'nunmber_of_blocks' is not divisible by vector length")
+    vectorization_blockstructured_tail_ordering = CodegenOption(default='consecutive', helpstr="Ordering of the tail w.r.t the vectorized loop. Possible values: consecutive|blocked")
     adjoint = CodegenOption(default=False, helpstr="Generate adjoint operator")
     control = CodegenOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable")
     objective_function = CodegenOption(default=None, helpstr="Name of form representing the objective function in UFL file")
-- 
GitLab