diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake index 2e3de48d33d9939aed33ffbec4d90035f50eb42b..1803b1e508e43188c22ee1b846bd3d42abf0211f 100644 --- a/cmake/modules/GeneratedSystemtests.cmake +++ b/cmake/modules/GeneratedSystemtests.cmake @@ -49,7 +49,7 @@ function(dune_add_formcompiler_system_test) add_generated_executable(TARGET ${tname} UFLFILE ${SYSTEMTEST_UFLFILE} - FORM_COMPILER_ARGS --ini-file ${inifile} --timer + FORM_COMPILER_ARGS --ini-file ${inifile} ) # Exclude the target from all diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 5f27ede95ccb273d089f231fc64083cb0c854d51..b521bf6dc61b99c0c614116fada26f476926e51a 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -39,8 +39,9 @@ def get_form_compiler_arguments(): parser.add_argument("--diagonal-transformation-matrix", action="store_true", help="set option if the jacobian of the transformation is diagonal (axiparallel grids)") parser.add_argument("--constant-transformation-matrix", action="store_true", help="set option if the jacobian of the transformation is constant on a cell") parser.add_argument("--ini-file", type=str, help="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)") - parser.add_argument("--timer", action="store_true", help="measure times") - parser.add_argument("--opcounter", action="store_true", default=False, help="Count operations. Should only be used with yaspgrid. Timer should be set.") + parser.add_argument("--opcounter", action="store_true", help="Count operations. Note: In this case only oparor applications are generated since solving and operator counting does not work.") + parser.add_argument("--time-opcounter", action="store_true", help="Generate opcounter codepath. 
Can be used for timing opcounter programs without setting the opcounter option.") + parser.add_argument("--instrumentation-level", type=int, default=0, help="Control time/opcounter measurements. 0-do nothing, 1-measure program as a whole, 2-operator applications, 3-measure kernel (eg. alpha-volume, ...), 4-parts of kernel (eg. stage 1-3 of SF)") parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project") parser.add_argument("--fastdg", action="store_true", help="Use FastDGGridOperator from PDELab.") # TODO at some point this help description should be updated diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py index 62a72fcea2c6ebc82596be76fa17f1d4d19dc15c..775b3bda292210ab142d4c37dcfde16cf3852fe5 100644 --- a/python/dune/perftool/pdelab/driver.py +++ b/python/dune/perftool/pdelab/driver.py @@ -1188,10 +1188,10 @@ def dune_solve(): snp = name_stationarynonlinearproblemsolver(go_type, go) solve = "{}.apply();".format(snp) - if get_option('timer'): + if get_option('instrumentation_level') >= 2: setup_timer() from dune.perftool.generation import post_include - post_include("HP_DECLARE_TIMER(total);", filetag="driver") + post_include("HP_DECLARE_TIMER(solve);", filetag="driver") # Print times after solving from dune.perftool.generation import get_global_context_value @@ -1202,12 +1202,14 @@ def dune_solve(): lop_name = name_localoperator(formdata) timestream = name_timing_stream() print_times.append("{}.dump_timers({}, argv[0], true);".format(lop_name, timestream)) - solve = ["HP_TIMER_START(total);", + + solve = ["HP_TIMER_START(solve);", "{}".format(solve), - "HP_TIMER_STOP(total);", - "DUMP_TIMER(total, {}, true);".format(timestream), + "HP_TIMER_STOP(solve);", + "DUMP_TIMER(solve, {}, true);".format(timestream), ] - solve.extend(print_times) + if get_option('instrumentation_level') >= 3: + solve.extend(print_times) return solve @@ -1391,7 +1393,7 @@ def 
name_test_fail_variable(): @cached def setup_timer(): - assert(get_option('timer')) + assert(get_option('instrumentation_level') >= 1) # Necessary includes and defines from dune.perftool.generation import pre_include @@ -1697,7 +1699,9 @@ def generate_driver(formdatas, data): set_driver_data(formdatas, data) # Entrypoint for driver generation - if get_option("opcounter"): + if get_option("opcounter") or get_option("time_opcounter"): + if get_option("time_opcounter"): + assert(not get_option("opcounter")) assert(any(_driver_data['form'].ufl_cell().cellname() in x for x in ["vertex", "interval", "quadrilateral", "hexahedron"])) # In case of operator conunting we only assemble the matrix and evaluate the residual @@ -1714,7 +1718,18 @@ def generate_driver(formdatas, data): from dune.perftool.generation import retrieve_cache_items from cgen import FunctionDeclaration, FunctionBody, Block, Value driver_signature = FunctionDeclaration(Value('bool', 'driver'), [Value('int', 'argc'), Value('char**', 'argv')]) - driver_body = Block(contents=[i for i in retrieve_cache_items("preamble", make_generable=True)]) + contents = [i for i in retrieve_cache_items("preamble", make_generable=True)] + + from cgen import Line + if get_option("instrumentation_level") >= 1: + from dune.perftool.generation import post_include + post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver") + contents.insert(0, Line(text="HP_TIMER_START(driver);\n")) + contents.insert(len(contents) - 1, Line(text="HP_TIMER_STOP(driver);\n")) + timestream = name_timing_stream() + contents.insert(len(contents) - 1, Line(text="DUMP_TIMER(driver, {}, true);\n".format(timestream))) + contents.insert(0, Line(text="\n")) + driver_body = Block(contents) driver = FunctionBody(driver_signature, driver_body) filename = get_option("driver_file") diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 
0686f44c49001424ef3cb819c9df134243ab5b18..839c57f8f5bef06cd85a241a283684ab87311f4b 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -556,7 +556,6 @@ class TimerMethod(ClassMember): knl = name_example_kernel() assert(knl is not None) - # TODO: operator counting only works if alpha_volume_kernel exists content = ["template <typename Stream>", "void dump_timers(Stream& {}, char* {}, bool {})".format(os, ex, reset), "{", @@ -595,7 +594,7 @@ class LoopyKernelMethod(ClassMember): content.append(' ' + p) # Start timer - if add_timings and get_option('timer'): + if add_timings and get_option('instrumentation_level') >= 3: from dune.perftool.pdelab.signatures import assembler_routine_name timer_name = assembler_routine_name() + '_kernel' name_example_kernel(name=timer_name) @@ -607,7 +606,7 @@ class LoopyKernelMethod(ClassMember): content.extend(l for l in generate_body(kernel).split('\n')[1:-1]) # Stop timer - if add_timings and get_option('timer'): + if add_timings and get_option('instrumentation_level') >= 3: content.append(' ' + 'HP_TIMER_STOP({});'.format(timer_name)) content.append('}') @@ -814,7 +813,7 @@ def generate_localoperator_file(formdata, kernels, filename): for k in kernels.values(): operator_methods.extend(k) - if get_option('timer'): + if get_option('instrumentation_level') >= 3: include_file('dune/perftool/common/timer.hh', filetag='operatorfile') operator_methods.append(TimerMethod()) diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index 35d91cc0d553f803988c8c6e932aed18144c6337..216ce95c0f7a06ab99cedd7210f01b740b69999b 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -10,6 +10,7 @@ from dune.perftool.generation import (backend, barrier, built_instruction, domain, + dump_accumulate_timer, function_mangler, generator_factory, get_counter, @@ -17,6 +18,7 @@ from dune.perftool.generation import 
(backend, globalarg, iname, instruction, + post_include, kernel_cached, retrieve_cache_items, silenced_warning, @@ -32,6 +34,7 @@ from dune.perftool.loopy.buffer import (get_buffer_temporary, from dune.perftool.sumfact.quadrature import nest_quadrature_loops from dune.perftool.pdelab.localoperator import determine_accumulation_space from dune.perftool.pdelab.restriction import restricted_name +from dune.perftool.pdelab.signatures import assembler_routine_name from dune.perftool.pdelab.spaces import (name_lfs, name_lfs_bound, ) @@ -178,6 +181,16 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): replace_dict[Variable(iname)] = i expression = substitute(pymbolic_expr, replace_dict) + # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1) + timer_dep = frozenset() + if get_option("instrumentation_level") >= 4: + timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' + post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') + dump_accumulate_timer(timer_name) + if(visitor.inames): + timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + within_inames=frozenset(visitor.inames))}) + # Determine dependencies from loopy.match import Or, Writes from loopy.symbolic import DependencyMapper @@ -192,12 +205,17 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): forced_iname_deps=frozenset(quadrature_inames() + visitor.inames), forced_iname_deps_is_final=True, tags=frozenset({"quadvec"}).union(vectag), - depends_on=frozenset({deps}) + depends_on=frozenset({deps}).union(timer_dep) ) if insn_dep is None: insn_dep = frozenset({contrib_dep}) + if get_option("instrumentation_level") >= 4: + insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=insn_dep, + within_inames=frozenset(visitor.inames))}) + # Add a sum factorization kernel that implements the multiplication # with the test 
function (stage 3) pref_pos = i if accterm.argument.index else None @@ -351,6 +369,14 @@ def sum_factorization_kernel(a_matrices, ctags = ctags + ",vec" vec_shape = (4,) + if get_option("instrumentation_level") >= 4: + timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(stage) + post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') + dump_accumulate_timer(timer_name) + insn_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + depends_on=insn_dep, + within_inames=additional_inames)}) + insn_dep = frozenset({barrier(depends_on=insn_dep, within_inames=additional_inames, )}) @@ -432,6 +458,17 @@ def sum_factorization_kernel(a_matrices, ) }) + if get_option("instrumentation_level") >= 4: + insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=insn_dep, + within_inames=additional_inames)}) + if stage == 1: + qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' + post_include('HP_DECLARE_TIMER({});'.format(qp_timer_name), filetag='operatorfile') + dump_accumulate_timer(qp_timer_name) + insn_dep = frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name), + depends_on=insn_dep)}) + if outshape is None: assert stage == 3 outshape = tuple(mat.rows for mat in a_matrices)