From ef9e2a22c63955be61078b3e18bda4aaf8985af7 Mon Sep 17 00:00:00 2001 From: Marcel Koch <marcel.koch@uni-muenster.de> Date: Tue, 23 Oct 2018 11:12:55 +0200 Subject: [PATCH] add dependencies to reduce loopy warnings --- .../perftool/blockstructured/accumulation.py | 5 ++- .../perftool/blockstructured/vectorization.py | 40 ++++++++++++++----- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py index ee52acbb..0eca9f4f 100644 --- a/python/dune/perftool/blockstructured/accumulation.py +++ b/python/dune/perftool/blockstructured/accumulation.py @@ -9,6 +9,8 @@ from dune.perftool.generation.loopy import function_mangler, globalarg import loopy as lp import pymbolic.primitives as prim +from loopy.match import Writes + def name_accumulation_alias(container, accumspace): name = container + "_" + accumspace.lfs.name + "_alias" @@ -64,5 +66,6 @@ def generate_accumulation_instruction(expr, visitor): forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)), forced_iname_deps_is_final=True, predicates=predicates, - tags=frozenset({'accum'}) + tags=frozenset({'accum'}), + depends_on=frozenset({Writes(accumvar_alias)}) ) diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py index 62aa0fb8..3946e16d 100644 --- a/python/dune/perftool/blockstructured/vectorization.py +++ b/python/dune/perftool/blockstructured/vectorization.py @@ -2,17 +2,16 @@ import loopy as lp import numpy as np import pymbolic.primitives as prim -from loopy.match import Tagged, Id +from loopy.match import Tagged, Id, Writes, Or -from dune.perftool.generation import get_global_context_value +from dune.perftool.generation import get_global_context_value, silenced_warning from dune.perftool.loopy.target import dtype_floatingpoint from dune.perftool.loopy.temporary import DuneTemporaryVariable from dune.perftool.loopy.symbolic import substitute from dune.perftool.loopy.vcl import get_vcl_type_size, VCLPermute, VCLLoad, VCLStore from dune.perftool.options import get_form_option -from dune.perftool.pdelab.argument import PDELabAccumulationFunction from dune.perftool.pdelab.geometry import world_dimension -from dune.perftool.tools import get_pymbolic_indices +from dune.perftool.tools import get_pymbolic_basename def add_vcl_temporaries(knl): @@ -152,10 +151,14 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): assignee_tail = substitute(insn.assignee, subst_map) expr_tail = prim.Sum((substitute(var_left, {iname_inner: 0, **replace_tail_inames}), assignee_tail)) + write_to_tail_ids = tuple(i.id for i in lp.find_instructions(knl, + Writes(get_pymbolic_basename(assignee_tail)))) + new_insns.append(lp.Assignment(assignee=assignee_tail, expression=expr_tail, id=id_accum_tail, - depends_on=frozenset({id_accum, id_permute, id_set_left, id_init_a}), + depends_on=frozenset({id_accum, id_permute, id_set_left, id_init_a}) | + frozenset(write_to_tail_ids), within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer}) - inames_micro) | inames_tail, tags=frozenset({'tail'}))) @@ -170,6 +173,7 @@ def add_vcl_access(knl, iname_inner): from loopy.match import Reads, Tagged accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum')))) read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias')))) + vectorized_insns = accum_insns | read_insns from loopy.symbolic import CombineMapper from loopy.symbolic import IdentityMapper @@ -198,23 +202,29 @@ def add_vcl_access(knl, iname_inner): aic = AliasIndexCollector() load_insns = [] read_dependencies = dict() + vectorized_insn_to_vector_names = dict() for id in read_insns: insn = knl.id_to_insn[id] alias, index = aic(insn.expression) name_alias = alias.name name_vec = name_alias.replace('alias', 'vec') + vectorized_insn_to_vector_names[id] = (name_alias, name_vec) # compute index without vec iname strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) if i != 0 and i.name != iname_inner)) + # find write insns + write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec))))) + # add load instruction load_id = idg('insn_' + name_vec + '_load') call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),)) load_insns.append(lp.CallInstruction(assignees=(), expression=call_load, - id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),)) + id=load_id, within_inames=insn.within_inames | insn.reduction_inames(), + depends_on=insn.depends_on | write_ids,)) read_dependencies.setdefault(id, set()) read_dependencies[id].add(load_id) @@ -226,18 +236,23 @@ def add_vcl_access(knl, iname_inner): alias, index = aic(insn.expression) name_alias = alias.name name_vec = name_alias.replace('alias', 'vec') + vectorized_insn_to_vector_names[id] = (name_alias, name_vec) # flat index without vec iname strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) if i != 0 and i.name != iname_inner)) + # find write insns + write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec))))) + # add store instruction store_id = idg('insn_' + name_vec + '_store') call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),)) store_insns.append(lp.CallInstruction(assignees=(), expression=call_store, id=store_id, within_inames=insn.within_inames, - depends_on=insn.depends_on | frozenset({id}) | read_dependencies[id])) + depends_on=insn.depends_on | frozenset({id}) | read_dependencies[id] | + write_ids)) # replace alias with vcl vector, except for accumulation assignee vector_alias = [a for a in knl.arg_dict if a.endswith('alias')] @@ -279,9 +294,12 @@ def add_vcl_access(knl, iname_inner): # add store and load dependencies and set right accumulation assignee new_insns = [] for insn in knl.instructions: - if insn.id not in read_insns | accum_insns: + if insn.id not in vectorized_insns: new_insns.append(insn) else: + # find write insns + name_alias, name_vec = vectorized_insn_to_vector_names[insn.id] + write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec))))) if insn.id in accum_insns: assignee_alias = insn.assignee try: @@ -293,9 +311,11 @@ def add_vcl_access(knl, iname_inner): from dune.perftool.error import PerftoolVectorizationError raise PerftoolVectorizationError new_insns.append(insn.copy(assignee=assignee_vec, - depends_on=insn.depends_on | read_dependencies[insn.id])) + depends_on=insn.depends_on | read_dependencies[insn.id] | + write_ids)) else: - new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id])) + new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id] | + write_ids)) return knl.copy(instructions=new_insns + load_insns + store_insns) -- GitLab