diff --git a/python/dune/perftool/loopy/transformations/instrumentation.py b/python/dune/perftool/loopy/transformations/instrumentation.py index 89b08b6f0e191ca06db56d820c585ebe585e250b..2771ba141ccf5978067e3569038c4307bd01a357 100644 --- a/python/dune/perftool/loopy/transformations/instrumentation.py +++ b/python/dune/perftool/loopy/transformations/instrumentation.py @@ -22,7 +22,7 @@ def _union(a): return frozenset.union(*a) -def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False): +def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False, depends_on=frozenset()): """ Transform loopy kernel to contain instrumentation code Arguments: @@ -32,6 +32,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o identifier : The name of the counter to start and stop level : The instrumentation level this measurement is defined at filetag : The tag of the file that should contain the counter definitions + depends_on: Additional dependencies (e.g. loopy match expressions) to add to the start instruction. + This is a workaround for currently wrong behaviour of the transformation, for use in + cases where much of the structure of the instrumentation is known a priori. """ # If the instrumentation level is not high enough, this is a no-op if level > get_option("instrumentation_level"): @@ -53,6 +56,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o insn_inames = _intersect(tuple(i.within_inames for i in insns)) other_inames = _union(tuple(i.within_inames for i in lp.find_instructions(knl, lp.match.Not(match)))) within = _intersect((insn_inames, other_inames)) + uniontags = _intersect(tuple(i.tags for i in insns)) # Get a unique identifer - note that the same timer could be started and stopped several times # within one kernel... 
@@ -67,8 +71,9 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o "HP_TIMER_START({});".format(identifier), id=start_id, within_inames=within, - depends_on=start_depends, + depends_on=depends_on.union(start_depends), boostable_into=frozenset(), + tags=uniontags, ) # Add dependencies on the timing instructions @@ -82,6 +87,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o within_inames=within, depends_on=frozenset(i.id for i in insns), boostable_into=frozenset(), + tags=uniontags, ) # Find all the instructions that should depend on stop @@ -98,4 +104,7 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions)) # Add all the modified instructions into the kernel object - return knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn]) + knl = knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn]) + + from loopy.kernel.creation import resolve_dependencies + return resolve_dependencies(knl) diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 04dad9834dae0a3b1fa6bd06ec204ec1e5d32135..f38b76d7e8f8d65819d4d357f037ecc8f1a3208c 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -569,8 +569,8 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin if add_timings and get_form_option("sumfact"): from dune.perftool.pdelab.signatures import assembler_routine_name kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage1"), "{}_kernel_stage1".format(assembler_routine_name()), 4) - kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4) - kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), 
"{}_kernel_stage3".format(assembler_routine_name()), 4) + kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage1")})) + kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4, depends_on=frozenset({lp.match.Tagged("sumfact_stage2")})) if wrap_in_cgen: # Wrap the kernel in something which can generate code