From fed5721ed2918d419c04c08b0f86855a4e789989 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@uni-muenster.de>
Date: Fri, 15 Feb 2019 12:29:39 +0100
Subject: [PATCH] ensure ordering: vectorized code before tail

---
 .../codegen/blockstructured/vectorization.py  | 93 ++++++++++++-------
 1 file changed, 60 insertions(+), 33 deletions(-)

diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index 681551ca..b68d4b10 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -41,7 +41,7 @@ def add_vcl_temporaries(knl, vcl_size):
                     iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}))
 
 
-def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size):
+def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level):
     nptype = dtype_floatingpoint()
 
     accum_insns = lp.find_instructions(knl, And((Tagged('accum'), Iname(inner_iname))))
@@ -93,13 +93,14 @@ def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size):
             var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(inner_iname),))
 
             # init a
-            id_init_a = idg('{}_init_' + identifier_left)
+            id_init_a = idg('insn_init_' + identifier_left)
             new_insns.append(lp.Assignment(assignee=substitute(var_left, replace_head_inames),
                                            expression=0,
                                            id=id_init_a,
                                            within_inames=(insn.within_inames - frozenset({outer_iname}) -
                                                           inames_micro) | inames_head,
-                                           tags=frozenset({'head_vec{}'.format(vcl_size)})))
+                                           tags=frozenset({'head_vec{}'.format(vcl_size),
+                                                           'vectorized_{}'.format(level)})))
 
             # setze werte für a und b
             expr_right = substitute(expr_without_r, {iname_ix: 1})
@@ -111,12 +112,14 @@ def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size):
                                            expression=expr_right,
                                            id=id_set_right,
                                            depends_on=insn.depends_on,
-                                           within_inames=insn.within_inames - frozenset({iname_ix})))
+                                           within_inames=insn.within_inames - frozenset({iname_ix}),
+                                           tags=frozenset({'vectorized_{}'.format(level)})))
             new_insns.append(lp.Assignment(assignee=var_left,
                                            expression=expr_left,
                                            id=id_set_left,
                                            depends_on=insn.depends_on | frozenset({id_init_a}),
-                                           within_inames=insn.within_inames - frozenset({iname_ix})))
+                                           within_inames=insn.within_inames - frozenset({iname_ix}),
+                                           tags=frozenset({'vectorized_{}'.format(level)})))
 
             # r+=a[iy]
             id_accum = idg('{}_mod_accum'.format(insn.id))
@@ -130,7 +133,8 @@ def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size):
                                            depends_on=insn.depends_on | frozenset({id_set_left,
                                                                                    id_init_a, id_set_right}),
                                            within_inames=insn.within_inames - frozenset({iname_ix}),
-                                           tags=frozenset({'accum_vec{}'.format(vcl_size)})))
+                                           tags=frozenset({'accum_vec{}'.format(vcl_size),
+                                                           'vectorized_{}'.format(level)})))
             # a[iy] = permute
             id_permute = idg('{}_permute'.format(insn.id))
             expr_permute = prim.Call(VCLPermute(nptype, vcl_size, (vcl_size - 1,) + (-1,) * (vcl_size - 1)),
@@ -140,7 +144,8 @@ def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size):
                                            id=id_permute,
                                            depends_on=insn.depends_on | frozenset({id_set_left, id_init_a, id_set_right,
                                                                                    id_accum}),
-                                           within_inames=insn.within_inames - frozenset({iname_ix})
+                                           within_inames=insn.within_inames - frozenset({iname_ix}),
+                                           tags=frozenset({'vectorized_{}'.format(level)})
                                            ))
 
             # tail handling, uses tail alias
@@ -161,7 +166,8 @@ def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size):
                                                        frozenset(write_to_tail_ids)),
                                            within_inames=(insn.within_inames - frozenset({inner_iname, outer_iname}) -
                                                           inames_micro) | inames_tail,
-                                           tags=frozenset({'tail_vec{}'.format(vcl_size)})))
+                                           tags=frozenset({'tail_vec{}'.format(vcl_size),
+                                                           'vectorized_{}'.format(level)})))
         else:
             if insn.id.endswith('tail') and insn.id.replace('_tail', '') in accum_ids:
                 accum_id = insn.id.replace('_tail', '')
@@ -240,7 +246,8 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
         call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),))
         load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
                                              id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),
-                                             depends_on=insn.depends_on | write_ids,))
+                                             depends_on=insn.depends_on | write_ids,
+                                             tags=frozenset({'vectorized_{}'.format(level)})))
         read_dependencies.setdefault(id, set())
         read_dependencies[id].add(load_id)
 
@@ -268,7 +275,8 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
         store_insns.append(lp.CallInstruction(assignees=(), expression=call_store,
                                               id=store_id, within_inames=insn.within_inames,
                                               depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] |
-                                                          write_ids)))
+                                                          write_ids),
+                                              tags=frozenset({'vectorized_{}'.format(level)})))
 
     # replace alias with vcl vector, except for accumulation assignee
     vector_alias = [a for a in knl.arg_dict if a.endswith(alias_suffix)]
@@ -291,14 +299,15 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
                 new_insns.append(insn)
         knl_with_subst_insns = knl_with_subst_insns.copy(instructions=new_insns)
 
-        # substitution rule for alias[ex_outer,ex_inner, ey, ix, iy] -> vec[ex_inner]
+        # substitution rule for alias[[ex_o]*l,ex_inner, ey, ix, iy] -> vec[ex_inner]
         parameters = ','.join(['ex_o{}'.format(l) for l in range(level + 1)]) + \
                      ',v_i,' + \
                      ','.join(['e' + d for d in dim_names[1:dim]]) + \
                      ',ix,' + \
                      ','.join(['i' + d for d in dim_names[1:dim]])
-        knl_with_subst_insns = lp.extract_subst(knl_with_subst_insns, alias + '_subst', '{}[{}]'.format(alias, parameters),
-                                             parameters=parameters)
+        knl_with_subst_insns = lp.extract_subst(knl_with_subst_insns,
+                                                alias + '_subst', '{}[{}]'.format(alias, parameters),
+                                                parameters=parameters)
         new_subst = knl_with_subst_insns.substitutions.copy()
         rule = new_subst[alias + '_subst']
         rule.expression = prim.Subscript(prim.Variable(alias.replace(alias_suffix, vector_sufix)),
@@ -330,10 +339,12 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
                     raise CodegenVectorizationError
                 new_insns.append(insn.copy(assignee=assignee_vec,
                                            depends_on=(insn.depends_on | read_dependencies[insn.id] |
-                                                       write_ids)))
+                                                       write_ids),
+                                           tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
             else:
                 new_insns.append(insn.copy(depends_on=(insn.depends_on | read_dependencies[insn.id] |
-                                                       write_ids)))
+                                                       write_ids),
+                                           tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
 
     return knl.copy(instructions=new_insns + load_insns + store_insns)
 
@@ -387,29 +398,28 @@ def add_iname_array(knl, iname):
     return knl
 
 
-def add_vcl_iname_array(knl, iname, vec_iname, vcl_size):
+def add_vcl_iname_array(knl, iname, vec_iname, vcl_size, level):
     insns_with_macro_points = lp.find_instructions(knl, And((Tagged(iname), Iname(vec_iname))))
 
     if insns_with_macro_points:
         iname_array = iname + '_arr'
         vector_name = iname + '_vec{}'.format(vcl_size)
 
-        new_temporaries = dict()
-        new_temporaries[vector_name] = DuneTemporaryVariable(vector_name, managed=True,
-                                                             shape=(get_form_option('number_of_blocks'),),
-                                                             scope=lp.temp_var_scope.PRIVATE, dtype=np.float64,
-                                                             base_storage=iname_array + '_buff',
-                                                             _base_storage_access_may_be_aliasing=True)
+        new_temporaries = {vector_name: DuneTemporaryVariable(vector_name, managed=True,
+                                                              shape=(get_form_option('number_of_blocks'),),
+                                                              scope=lp.temp_var_scope.PRIVATE, dtype=np.float64,
+                                                              base_storage=iname_array + '_buff',
+                                                              _base_storage_access_may_be_aliasing=True)}
         silenced_warning = ["read_no_write({})".format(vector_name)]
 
-        replacemap = dict()
-        replacemap[iname_array] = prim.Variable(vector_name)
+        replacemap = {iname_array: prim.Variable(vector_name)}
 
         new_insns = []
         for insn in knl.instructions:
             if insn in insns_with_macro_points:
                 transformed_insn = insn.with_transformed_expressions(lambda expr: substitute(expr, replacemap))
-                new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(iname_array)))
+                new_insns.append(transformed_insn.copy(depends_on='init_{}_buffer'.format(iname_array),
+                                                       tags=insn.tags | frozenset({'vectorized_{}'.format(level)})))
             else:
                 new_insns.append(insn)
 
@@ -423,7 +433,7 @@ def add_vcl_iname_array(knl, iname, vec_iname, vcl_size):
     return knl
 
 
-def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_size):
+def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_size, level):
     tail_size = get_form_option('number_of_blocks') % vcl_size
     new_dom = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(tail_iname, tail_size))
 
@@ -451,7 +461,8 @@ def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_siz
         new_within_inames = frozenset((iname + '_tail' if iname == inner_iname else iname
                                        for iname in insn.within_inames)) - frozenset({outer_iname})
         new_insns.append(new_insn.copy(id=insn.id + '_tail', depends_on=new_depends_on,
-                                   within_inames=new_within_inames))
+                                       within_inames=new_within_inames,
+                                       tags=insn.tags | frozenset({'tail_{}'.format(level)})))
 
     knl = knl.copy(domains=knl.domains + [new_dom], instructions=knl.instructions + new_insns,
                    temporary_variables=dict(**knl.temporary_variables, **temporaries_to_duplicate))
@@ -471,6 +482,21 @@ def realize_tail(knl, inner_iname, outer_iname, outer_bound, tail_iname, vcl_siz
     return lp.make_reduction_inames_unique(knl)
 
 
+def add_tail_dependencies(knl, level):
+    """Return a copy of knl whose 'tail_<level>' instructions depend on all 'vectorized_<level>' ones."""
+    vectorized_ids = frozenset(insn.id for insn in
+                               lp.find_instructions(knl, Tagged('vectorized_{}'.format(level))))
+    # Match by id (assumed unique per kernel): O(1) lookups instead of scanning a list per instruction.
+    tail_ids = frozenset(insn.id for insn in
+                         lp.find_instructions(knl, Tagged('tail_{}'.format(level))))
+
+    # Only the dependency sets of tail instructions grow; every other instruction is reused unchanged.
+    new_insns = [insn.copy(depends_on=insn.depends_on | vectorized_ids) if insn.id in tail_ids else insn
+                 for insn in knl.instructions]
+
+    return knl.copy(instructions=new_insns)
+
+
 def vectorize_micro_elements(knl):
     vec_iname = "subel_x"
     orig_iname = vec_iname
@@ -503,7 +529,7 @@ def vectorize_micro_elements(knl):
                 knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname)
 
                 tail_iname = vec_iname + '_inner' + '_tail'
-                knl = realize_tail(knl, inner_iname, outer_iname, iname_bound, tail_iname, vcl_size)
+                knl = realize_tail(knl, inner_iname, outer_iname, iname_bound, tail_iname, vcl_size, level)
             else:
                 knl = lp.split_iname(knl, vec_iname, vcl_size)
 
@@ -512,16 +538,17 @@ def vectorize_micro_elements(knl):
             array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
             knl = lp.split_array_axis(knl, array_alias, level, vcl_size)
 
-            knl = add_vcl_iname_array(knl, orig_iname, inner_iname, vcl_size)
             knl = add_vcl_temporaries(knl, vcl_size)
-            knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size)
+            knl = add_vcl_iname_array(knl, orig_iname, inner_iname, vcl_size, level)
+            knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level)
             knl = add_vcl_access(knl, inner_iname, vcl_size, level)
 
-            if tail_size > 0 and vectorize_tail:
-                knl = _do_vectorization(knl, tail_iname, tail_size, tail_vcl_size, level + 1)
+            if tail_size > 0:
+                knl = add_tail_dependencies(knl, level)
+                if vectorize_tail:
+                    knl = _do_vectorization(knl, tail_iname, tail_size, tail_vcl_size, level + 1)
 
             return knl
 
         knl = _do_vectorization(knl, orig_iname, get_form_option('number_of_blocks'), vcl_size)
-
     return knl
-- 
GitLab