From 3cbaeb7f04d72e9d415816a25bcd20a046a5274e Mon Sep 17 00:00:00 2001 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> Date: Wed, 28 Mar 2018 15:22:49 +0200 Subject: [PATCH] Reimplement instrumentation as a loopy transformation This is currently only used for level 4 instrumentation, as these start and stop markers were messing heavily with dependencies in the sum factorization code. This way, the necessary but ugly work of adding instrumentation is separated much more nicely. --- .../loopy/transformations/instrumentation.py | 101 ++++++++++++++++++ .../loopy/transformations/vectorize_quad.py | 6 +- python/dune/perftool/pdelab/localoperator.py | 10 +- python/dune/perftool/sumfact/accumulation.py | 35 +----- python/dune/perftool/sumfact/realization.py | 26 ----- 5 files changed, 117 insertions(+), 61 deletions(-) create mode 100644 python/dune/perftool/loopy/transformations/instrumentation.py diff --git a/python/dune/perftool/loopy/transformations/instrumentation.py b/python/dune/perftool/loopy/transformations/instrumentation.py new file mode 100644 index 00000000..89b08b6f --- /dev/null +++ b/python/dune/perftool/loopy/transformations/instrumentation.py @@ -0,0 +1,101 @@ +""" Add instrumentation instructions to a kernel """ + +from dune.perftool.generation import (dump_accumulate_timer, + post_include, + ) +from dune.perftool.options import get_option + +import loopy as lp + + +def _intersect(a): + """ Return intersection of a given tuple of frozensets. Also works for empty tuple """ + if len(a) == 0: + return frozenset() + return frozenset.intersection(*a) + + +def _union(a): + """ Return union of a given tuple of frozensets. Also works for empty tuple """ + if len(a) == 0: + return frozenset() + return frozenset.union(*a) + + +def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', operator=False): + """ Transform loopy kernel to contain instrumentation code + + Arguments: + knl : The loopy kernel, follows the loopy transformation convention + match : A loopy match object or a string (interpreted as instruction ID or tag) to describe + which instructions should be wrapped in an instrumentation block. + identifier : The name of the counter to start and stop + level : The instrumentation level this measurement is defined at + filetag : The tag of the file that should contain the counter definitions + """ + # If the instrumentation level is not high enough, this is a no-op + if level > get_option("instrumentation_level"): + return knl + + # If a string was given for match, heuristically make it a match object + if isinstance(match, str): + match = lp.match.Or((lp.match.Id(match), lp.match.Tagged(match))) + + # Find the instructions to wrap in instrumentation + insns = lp.find_instructions(knl, match) + rewritten_insns = [] + + # If the match is empty, this is also no op + if not insns: + return knl + + # Determine the iname nesting of the timing block + insn_inames = _intersect(tuple(i.within_inames for i in insns)) + other_inames = _union(tuple(i.within_inames for i in lp.find_instructions(knl, lp.match.Not(match)))) + within = _intersect((insn_inames, other_inames)) + + # Get a unique identifer - note that the same timer could be started and stopped several times + # within one kernel... + ident = identifier + if lp.find_instructions(knl, lp.match.Id("{}_start".format(identifier))): + ident = "{}_".format(ident) + + # Define the start instruction and correct dependencies for it + start_id = "{}_start".format(ident) + start_depends = _union(tuple(i.depends_on for i in insns)).difference(frozenset(i.id for i in insns)) + start_insn = lp.CInstruction([], + "HP_TIMER_START({});".format(identifier), + id=start_id, + within_inames=within, + depends_on=start_depends, + boostable_into=frozenset(), + ) + + # Add dependencies on the timing instructions + rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({start_id}))) for i in insns]) + + # Define the stop instruction and correct dependencies for it + stop_id = "{}_stop".format(ident) + stop_insn = lp.CInstruction([], + "HP_TIMER_STOP({});".format(identifier), + id=stop_id, + within_inames=within, + depends_on=frozenset(i.id for i in insns), + boostable_into=frozenset(), + ) + + # Find all the instructions that should depend on stop + dep_insns = filter(lambda i: _intersect((i.depends_on, frozenset(i.id for i in insns))), + lp.find_instructions(knl, lp.match.Not(match)) + ) + rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({stop_id}))) for i in dep_insns]) + + # Trigger code generation on the file/operator level + post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag) + dump_accumulate_timer(identifier) + + # Filter all the instructions which were untouched + other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions)) + + # Add all the modified instructions into the kernel object + return knl.copy(instructions=rewritten_insns + other_insns + [start_insn, stop_insn]) diff --git a/python/dune/perftool/loopy/transformations/vectorize_quad.py b/python/dune/perftool/loopy/transformations/vectorize_quad.py index e3f30cc2..f82f4829 100644 --- a/python/dune/perftool/loopy/transformations/vectorize_quad.py +++ b/python/dune/perftool/loopy/transformations/vectorize_quad.py @@ -206,6 +206,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): within_inames=common_inames.union(frozenset({outer_iname, vec_iname})), within_inames_is_final=True, id="{}_rotate{}".format(quantity, suffix), + tags=frozenset({"sumfact_stage2"}), )) # Add substitution rules @@ -267,7 +268,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): within_inames=common_inames.union(frozenset({outer_iname, vec_iname})), within_inames_is_final=True, id=insn.id, - tags=frozenset({"vec_write{}".format(suffix)}) + tags=frozenset({"vec_write{}".format(suffix), "sumfact_stage2"}) ) ) @@ -282,6 +283,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): within_inames=common_inames.union(frozenset({outer_iname, vec_iname})), within_inames_is_final=True, id="{}_rotateback{}".format(lhsname, suffix), + tags=frozenset({"sumfact_stage2"}), )) # Add the necessary vector indices @@ -296,6 +298,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): within_inames=common_inames, within_inames_is_final=True, id="assign_{}{}".format(name, suffix), + tags=frozenset({"sumfact_stage2"}), )) new_insns.append(lp.Assignment(prim.Variable(name), # assignee prim.Sum((prim.Variable(name), increment)), # expression @@ -304,6 +307,7 @@ def _vectorize_quadrature_loop(knl, inames, suffix): depends_on=frozenset({Tagged("vec_write{}".format(suffix)), "assign_{}{}".format(name, suffix)}), depends_on_is_final=True, id="update_{}{}".format(name, suffix), + tags=frozenset({"sumfact_stage2"}), )) from loopy.kernel.creation import resolve_dependencies diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 7fe95bbe..15184288 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -562,6 +562,14 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin from dune.perftool.loopy.transformations.matchfma import match_fused_multiply_add kernel = match_fused_multiply_add(kernel) + # Add instrumentation to the kernel + from dune.perftool.loopy.transformations.instrumentation import add_instrumentation + if add_timings and get_form_option("sumfact"): + from dune.perftool.pdelab.signatures import assembler_routine_name + kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage1"), "{}_kernel_stage1".format(assembler_routine_name()), 4) + kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage2"), "{}_kernel_quadratureloop".format(assembler_routine_name()), 4) + kernel = add_instrumentation(kernel, lp.match.Tagged("sumfact_stage3"), "{}_kernel_stage3".format(assembler_routine_name()), 4) + if wrap_in_cgen: # Wrap the kernel in something which can generate code if signature is None: @@ -668,7 +676,7 @@ def cgen_class_from_cache(tag, members=[]): tparams = [i for i in retrieve_cache_items('{} and template_param'.format(tag))] # Construct the constructor - constructor_knl = extract_kernel_from_cache(tag, "constructor_kernel", None, wrap_in_cgen=False) + constructor_knl = extract_kernel_from_cache(tag, "constructor_kernel", None, wrap_in_cgen=False, add_timings=False) from dune.perftool.loopy.target import DuneTarget constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False)) signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params)) diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 43925fb7..47b0a982 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -475,23 +475,6 @@ def generate_accumulation_instruction(expr, visitor): tags=frozenset(["quadvec", "gradvec"]), ) - # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1) - timer_dep = frozenset() - if get_option("instrumentation_level") >= 4: - timer_name = "{}_kernel_stage1".format(assembler_routine_name()) - timer_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=frozenset({lp.match.Tagged("sumfact_stage1"), 'hptimerstart_{}'.format(timer_name)}), - id="hptimerstop_{}".format(timer_name) - )} - ) - timer_name = '{}_kernel_quadratureloop'.format(assembler_routine_name()) - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - dump_accumulate_timer(timer_name) - timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - within_inames=frozenset(jacobian_inames), - id="hptimerstart_{}".format(timer_name), - depends_on=timer_dep)}) - # Determine dependencies from loopy.match import Or, Writes from loopy.symbolic import DependencyMapper @@ -506,19 +489,13 @@ def generate_accumulation_instruction(expr, visitor): expression=expr, forced_iname_deps=frozenset(quadrature_inames(trial_leaf_element) + jacobian_inames), forced_iname_deps_is_final=True, - tags=frozenset({"quadvec"}).union(vectag), - depends_on=frozenset({deps}).union(timer_dep).union(frozenset({lp.match.Tagged("sumfact_stage1")})), + tags=frozenset({"quadvec", "sumfact_stage2"}).union(vectag), + depends_on=frozenset({deps}).union(frozenset({lp.match.Tagged("sumfact_stage1")})), ) if insn_dep is None: insn_dep = frozenset({contrib_dep}) - if get_option("instrumentation_level") >= 4: - insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=insn_dep, - within_inames=frozenset(jacobian_inames), - id="hptimerstop_{}".format(timer_name))})) - # Add a sum factorization kernel that implements the multiplication # with the test function (stage 3) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel @@ -526,11 +503,3 @@ def generate_accumulation_instruction(expr, visitor): if not get_form_option("fastdg"): insn_dep = vsf.interface.realize(vsf, result, insn_dep) - - if get_option("instrumentation_level") >= 4: - assert vsf.stage == 3 - timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), vsf.stage) - insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=frozenset({lp.match.Tagged("sumfact_stage3")}), - within_inames=frozenset(jacobian_inames), - id="hptimerstop_{}".format(timer_name))}) diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index 55e14108..777f8dab 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -67,32 +67,6 @@ def name_buffer_storage(buff, which): def _realize_sum_factorization_kernel(sf): insn_dep = sf.insn_dep - # Measure times and count operations in c++ code - if get_option("instrumentation_level") >= 4: - setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) - timer_dep = frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer), - id="hptimerstop_{}".format(setuptimer))}) - - timer_name = '{}_kernel_stage1'.format(assembler_routine_name()) - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - dump_accumulate_timer(timer_name) - timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - id="hptimerstart_{}".format(timer_name), - depends_on=timer_dep, - ), - })) - - timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), sf.stage) - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - dump_accumulate_timer(timer_name) - timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - id="hptimerstart_{}".format(timer_name), - within_inames=frozenset(sf.within_inames), - depends_on=timer_dep.union(insn_dep), - ), - })) - insn_dep = insn_dep.union(timer_dep) - # Get all the necessary pieces for a function call buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2)) -- GitLab