diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index df47d5e9fb1ac03d8ed6bde6ec1de203470f4d2f..a4d8292f5f88bc315980efb521bf7c2cf6a95153 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -140,10 +140,9 @@ def _insn_cache_key(code=None, expression=None, **kwargs): def instruction(code=None, expression=None, **kwargs): assert (code is not None) or (expression is not None) assert not ((code is not None) and (expression is not None)) - assert 'id' not in kwargs # Get an ID for this instruction - id = 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4)) + id = kwargs.pop("id", 'insn_{}'.format(str(get_counter('__insn_id')).zfill(4))) # Now create the actual instruction if code: diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py index c859c99d0ab964e298a55bf2c9854f170c507599..3877165c574ac8ef7b6d7197b56f97d071f38f42 100644 --- a/python/dune/perftool/pdelab/driver/__init__.py +++ b/python/dune/perftool/pdelab/driver/__init__.py @@ -298,10 +298,10 @@ def generate_driver(): add_section("constraints", "Set up constraints container...") add_section("gridoperator", "Set up grid grid operators...") add_section("vector", "Set up solution vectors...") + add_section("timings", "Maybe take performance measurements...") add_section("solver", "Set up (non)linear solvers...") add_section("vtk", "Do visualization...") add_section("instat", "Set up instationary stuff...") - add_section("timings", "Maybe take performance measurements...") add_section("printing", "Maybe print residuals and matrices to stdout...") add_section("error", "Maybe calculate errors for test results...") diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py index 8f3ef96ff9fd2babe838a2b3ceb7baeeb63d67f7..0648df399d809fd84d30624711dfc2fb02bbc631 100644 --- a/python/dune/perftool/pdelab/driver/solve.py +++ b/python/dune/perftool/pdelab/driver/solve.py @@ -54,7 +54,8 @@ def dune_solve(): print_matrix() if get_option('instrumentation_level') >= 2: - from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream + from dune.perftool.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier + timestream = name_timing_stream() setup_timer() from dune.perftool.generation import post_include post_include("HP_DECLARE_TIMER(solve);", filetag="driver") @@ -68,7 +69,6 @@ def dune_solve(): if get_option('instrumentation_level') >= 3: from dune.perftool.pdelab.driver.gridoperator import name_localoperator lop_name = name_localoperator(form_ident) - timestream = name_timing_stream() solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) return solve diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 7d1ded787fa9fc6cd71a5bd975c6123231dfeffe..7fe95bbe858be0aa74df2c313ad6bd96a7507e68 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -495,7 +495,7 @@ def generate_kernels_per_integral(integrals): yield generate_kernel(integrals) -def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True): +def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timings=True): # Now extract regular loopy kernel components from dune.perftool.loopy.target import DuneTarget domains = [i for i in retrieve_cache_items("{} and domain".format(tag))] @@ -567,7 +567,7 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True): if signature is None: from dune.perftool.pdelab.signatures import assembly_routine_signature signature = assembly_routine_signature() - kernel = LoopyKernelMethod(signature, kernel) + kernel = LoopyKernelMethod(signature, kernel, add_timings=add_timings) return kernel diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 8881a510fefed4ed72fa49d2222de57e9c3cf7fb..e258f7adfa9acb4a9d6e841a2ed863b20a76cb5f 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -166,7 +166,8 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord): forced_iname_deps=frozenset(inames + additional_inames + self.within_inames), forced_iname_deps_is_final=True, depends_on=insn_dep, - predicates=sf.predicates + predicates=sf.predicates, + tags=frozenset({"sumfact_stage3"}), ) return frozenset({dep}) @@ -211,7 +212,10 @@ class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord): lhs = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + inames) result = prim.Sum((lhs, result)) - return frozenset({instruction(assignee=lhs, expression=result, **args)}) + return frozenset({instruction(assignee=lhs, + expression=result, + tags=frozenset({"sumfact_stage3"}), + **args)}) def _local_sizes(element): @@ -449,12 +453,18 @@ def generate_accumulation_instruction(expr, visitor): # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1) timer_dep = frozenset() if get_option("instrumentation_level") >= 4: - timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' + timer_name ="{}_kernel_stage1".format(assembler_routine_name()) + timer_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=frozenset({lp.match.Tagged("sumfact_stage1"), 'hptimerstart_{}'.format(timer_name)}), + id="hptimerstop_{}".format(timer_name) + )}) + timer_name = '{}_kernel_quadratureloop'.format(assembler_routine_name()) post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') dump_accumulate_timer(timer_name) - if(jacobian_inames): - timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - within_inames=frozenset(jacobian_inames))}) + timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + within_inames=frozenset(jacobian_inames), + id="hptimerstart_{}".format(timer_name), + depends_on=timer_dep)}) # Determine dependencies from loopy.match import Or, Writes @@ -478,9 +488,10 @@ def generate_accumulation_instruction(expr, visitor): insn_dep = frozenset({contrib_dep}) if get_option("instrumentation_level") >= 4: - insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=insn_dep, - within_inames=frozenset(jacobian_inames))}) + insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=insn_dep, + within_inames=frozenset(jacobian_inames), + id="hptimerstop_{}".format(timer_name))})) # Add a sum factorization kernel that implements the multiplication # with the test function (stage 3) @@ -488,4 +499,12 @@ def generate_accumulation_instruction(expr, visitor): result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep))) if not get_form_option("fastdg"): - vsf.output.realize(vsf, result, insn_dep) + insn_dep = vsf.output.realize(vsf, result, insn_dep) + + if get_option("instrumentation_level") >= 4: + assert vsf.stage == 3 + timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), vsf.stage) + insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=frozenset({lp.match.Tagged("sumfact_stage3")}), + within_inames=frozenset(jacobian_inames), + id="hptimerstop_{}".format(timer_name))}) diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index 620faab65b9c089b55b47bf2f5d3e859b2798706..dfe080284986f20c1bc7fd641b8cf6b4345d2997 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -77,6 +77,32 @@ def name_buffer_storage(buff, which): def _realize_sum_factorization_kernel(sf): insn_dep = sf.insn_dep + # Measure times and count operations in c++ code + if get_option("instrumentation_level") >= 4: + setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) + timer_dep = frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer), + id="hptimerstop_{}".format(setuptimer))}) + + timer_name = '{}_kernel_stage1'.format(assembler_routine_name()) + post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') + dump_accumulate_timer(timer_name) + timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + id="hptimerstart_{}".format(timer_name), + depends_on=timer_dep, + ), + })) + + timer_name = '{}_kernel_stage{}'.format(assembler_routine_name(), sf.stage) + post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') + dump_accumulate_timer(timer_name) + timer_dep = timer_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + id="hptimerstart_{}".format(timer_name), + within_inames=frozenset(sf.within_inames), + depends_on=timer_dep.union(insn_dep), + ), + })) + insn_dep = insn_dep.union(timer_dep) + # Get all the necessary pieces for a function call funcname = name_kernel_implementation_function(sf) buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2)) @@ -101,10 +127,11 @@ def _realize_sum_factorization_kernel(sf): # Call the function code = "{}({}, {});".format(funcname, *buffers) + tag = "sumfact_stage{}".format(sf.stage) insn_dep = frozenset({instruction(code=code, depends_on=insn_dep, within_inames=frozenset(sf.within_inames), - tags=frozenset({"sumfact_stage{}".format(sf.stage)}), + tags=frozenset({tag}), ) }) @@ -290,7 +317,7 @@ def realize_sumfact_kernel_function(sf): name = name_kernel_implementation_function(sf) from dune.perftool.pdelab.localoperator import extract_kernel_from_cache signature = "void {}(const char* buffer0, const char* buffer1) const".format(name) - kernel = extract_kernel_from_cache("kernel_default", name, [signature]) + kernel = extract_kernel_from_cache("kernel_default", name, [signature], add_timings=False) delete_cache_items("kernel_default") return kernel