diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index a8b69b7326f76ab51fe26635cfd7d4557750bc19..5797c08c7e6966d76c51860598994407e4c206ad 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -614,10 +614,6 @@ class LoopyKernelMethod(ClassMember): content.append('{') if kernel is not None: - # Add kernel preamble - for i, p in kernel.preambles: - content.append(' ' + p) - # Start timer if add_timings and get_option('instrumentation_level') >= 3: from dune.perftool.pdelab.signatures import assembler_routine_name @@ -627,6 +623,15 @@ class LoopyKernelMethod(ClassMember): content.append(' ' + 'HP_TIMER_START({});'.format(timer_name)) dump_accumulate_timer(timer_name) + if add_timings and get_option("instrumentation_level") >= 4: + setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) + post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile') + content.append(' HP_TIMER_START({});'.format(setuptimer)) + + # Add kernel preamble + for i, p in kernel.preambles: + content.append(' ' + p) + # Add kernel body content.extend(l for l in generate_body(kernel).split('\n')[1:-1]) diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index 5fe9ebb121a925d1d33ade412fa0c3cb9b4a794b..13acc6acb6a65607165961d59c33a6fba18fb876 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -59,6 +59,12 @@ def _realize_sum_factorization_kernel(sf): # Measure times and count operations in c++ code if get_option("instrumentation_level") >= 4: + if sf.stage == 1: + setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) + insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer), + within_inames=frozenset(sf.within_inames), + depends_on=insn_dep)})) + timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage) post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') dump_accumulate_timer(timer_name)