From ef9e2a22c63955be61078b3e18bda4aaf8985af7 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@uni-muenster.de>
Date: Tue, 23 Oct 2018 11:12:55 +0200
Subject: [PATCH] add dependencies to reduce loopy warnings

---
 .../perftool/blockstructured/accumulation.py  |  5 ++-
 .../perftool/blockstructured/vectorization.py | 40 ++++++++++++++-----
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py
index ee52acbb..0eca9f4f 100644
--- a/python/dune/perftool/blockstructured/accumulation.py
+++ b/python/dune/perftool/blockstructured/accumulation.py
@@ -9,6 +9,8 @@ from dune.perftool.generation.loopy import function_mangler, globalarg
 import loopy as lp
 import pymbolic.primitives as prim
 
+from loopy.match import Writes
+
 
 def name_accumulation_alias(container, accumspace):
     name = container + "_" + accumspace.lfs.name + "_alias"
@@ -64,5 +66,6 @@ def generate_accumulation_instruction(expr, visitor):
                 forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)),
                 forced_iname_deps_is_final=True,
                 predicates=predicates,
-                tags=frozenset({'accum'})
+                tags=frozenset({'accum'}),
+                depends_on=frozenset({Writes(accumvar_alias)})
                 )
diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py
index 62aa0fb8..3946e16d 100644
--- a/python/dune/perftool/blockstructured/vectorization.py
+++ b/python/dune/perftool/blockstructured/vectorization.py
@@ -2,17 +2,16 @@ import loopy as lp
 import numpy as np
 import pymbolic.primitives as prim
 
-from loopy.match import Tagged, Id
+from loopy.match import Tagged, Id, Writes, Or
 
-from dune.perftool.generation import get_global_context_value
+from dune.perftool.generation import get_global_context_value, silenced_warning
 from dune.perftool.loopy.target import dtype_floatingpoint
 from dune.perftool.loopy.temporary import DuneTemporaryVariable
 from dune.perftool.loopy.symbolic import substitute
 from dune.perftool.loopy.vcl import get_vcl_type_size, VCLPermute, VCLLoad, VCLStore
 from dune.perftool.options import get_form_option
-from dune.perftool.pdelab.argument import PDELabAccumulationFunction
 from dune.perftool.pdelab.geometry import world_dimension
-from dune.perftool.tools import get_pymbolic_indices
+from dune.perftool.tools import get_pymbolic_basename
 
 
 def add_vcl_temporaries(knl):
@@ -152,10 +151,14 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
             assignee_tail = substitute(insn.assignee, subst_map)
             expr_tail = prim.Sum((substitute(var_left, {iname_inner: 0, **replace_tail_inames}), assignee_tail))
 
+            write_to_tail_ids = tuple(i.id for i in lp.find_instructions(knl,
+                                                                         Writes(get_pymbolic_basename(assignee_tail))))
+
             new_insns.append(lp.Assignment(assignee=assignee_tail,
                                            expression=expr_tail,
                                            id=id_accum_tail,
-                                           depends_on=frozenset({id_accum, id_permute, id_set_left, id_init_a}),
+                                           depends_on=frozenset({id_accum, id_permute, id_set_left, id_init_a}) |
+                                                      frozenset(write_to_tail_ids),
                                            within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer}) -
                                                           inames_micro) | inames_tail,
                                            tags=frozenset({'tail'})))
@@ -170,6 +173,7 @@ def add_vcl_access(knl, iname_inner):
     from loopy.match import Reads, Tagged
     accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum'))))
     read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias'))))
+    vectorized_insns = accum_insns | read_insns
 
     from loopy.symbolic import CombineMapper
     from loopy.symbolic import IdentityMapper
@@ -198,23 +202,29 @@ def add_vcl_access(knl, iname_inner):
     aic = AliasIndexCollector()
     load_insns = []
     read_dependencies = dict()
+    vectorized_insn_to_vector_names = dict()
     for id in read_insns:
         insn = knl.id_to_insn[id]
 
         alias, index = aic(insn.expression)
         name_alias = alias.name
         name_vec = name_alias.replace('alias', 'vec')
+        vectorized_insn_to_vector_names[id] = (name_alias, name_vec)
 
         # compute index without vec iname
         strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
         index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
                                if i != 0 and i.name != iname_inner))
 
+        # find insns writing name_vec; NOTE(review): Or() repeats Writes(name_vec) -- was Writes(name_alias) meant?
+        write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
+
         # add load instruction
         load_id = idg('insn_' + name_vec + '_load')
         call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
         load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
-                                             id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),))
+                                             id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),
+                                             depends_on=insn.depends_on | write_ids,))
         read_dependencies.setdefault(id, set())
         read_dependencies[id].add(load_id)
 
@@ -226,18 +236,23 @@ def add_vcl_access(knl, iname_inner):
         alias, index = aic(insn.expression)
         name_alias = alias.name
         name_vec = name_alias.replace('alias', 'vec')
+        vectorized_insn_to_vector_names[id] = (name_alias, name_vec)
 
         # flat index without vec iname
         strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
         index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
                                if i != 0 and i.name != iname_inner))
 
+        # find insns writing name_vec; NOTE(review): Or() repeats Writes(name_vec) -- was Writes(name_alias) meant?
+        write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
+
         # add store instruction
         store_id = idg('insn_' + name_vec + '_store')
         call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
         store_insns.append(lp.CallInstruction(assignees=(), expression=call_store,
                                               id=store_id, within_inames=insn.within_inames,
-                                              depends_on=insn.depends_on | frozenset({id}) | read_dependencies[id]))
+                                              depends_on=insn.depends_on | frozenset({id}) | read_dependencies[id] |
+                                                         write_ids))
 
     # replace alias with vcl vector, except for accumulation assignee
     vector_alias = [a for a in knl.arg_dict if a.endswith('alias')]
@@ -279,9 +294,12 @@ def add_vcl_access(knl, iname_inner):
     # add store and load dependencies and set right accumulation assignee
     new_insns = []
     for insn in knl.instructions:
-        if insn.id not in read_insns | accum_insns:
+        if insn.id not in vectorized_insns:
             new_insns.append(insn)
         else:
+            # find insns writing name_vec; NOTE(review): name_alias is unused and Or() repeats Writes(name_vec)
+            name_alias, name_vec = vectorized_insn_to_vector_names[insn.id]
+            write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
             if insn.id in accum_insns:
                 assignee_alias = insn.assignee
                 try:
@@ -293,9 +311,11 @@ def add_vcl_access(knl, iname_inner):
                     from dune.perftool.error import PerftoolVectorizationError
                     raise PerftoolVectorizationError
                 new_insns.append(insn.copy(assignee=assignee_vec,
-                                           depends_on=insn.depends_on | read_dependencies[insn.id]))
+                                           depends_on=insn.depends_on | read_dependencies[insn.id] |
+                                                      write_ids))
             else:
-                new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id]))
+                new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id] |
+                                                      write_ids))
 
     return knl.copy(instructions=new_insns + load_insns + store_insns)
 
-- 
GitLab