diff --git a/bin/donkey_benchmark_compilation_wrapper.sh b/bin/donkey_benchmark_compilation_wrapper.sh index e2691b5d71db2473b731fabcea6a1095ec2516d9..a786d111264ef0e67769100713f1776e3e002c3d 100755 --- a/bin/donkey_benchmark_compilation_wrapper.sh +++ b/bin/donkey_benchmark_compilation_wrapper.sh @@ -13,7 +13,4 @@ ml parmetis ("$@") code=$? -echo "Code: $code" -sleep 0.1s - exit $code diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py index 7951b8b06bfdeb217bd3328c9b3b88f229bbc606..d383963318291ae947c70b1f083c320a60250303 100755 --- a/bin/donkey_benchmark_execution_wrapper.py +++ b/bin/donkey_benchmark_execution_wrapper.py @@ -13,8 +13,3 @@ ret = subprocess.call(command) # If that failed - fail! if ret != 0: sys.exit(ret) - -# If that was succesful, wait for the output file to be available on the filesystem -# This step is necessary because the NFS synchronization is too slow for our workflow. -while not os.path.isfile(sys.argv[2]): - time.sleep(0.1) diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake index bd3f9649116d72afee7118e5d3940e5f11a54e4e..61713109b32cc1af4716b0f9f133879592fec49c 100644 --- a/cmake/modules/DuneCodegenMacros.cmake +++ b/cmake/modules/DuneCodegenMacros.cmake @@ -81,6 +81,10 @@ find_package(benchmark) +if (DUNE_CODEGEN_PROFILING) + find_package(likwid) +endif() + add_custom_target(generation) # Gather a list of form compiler sources to add as dependencies @@ -174,11 +178,8 @@ function(add_generated_executable) if(DUNE_CODEGEN_PROFILING) # This is a bit silly, but cProfile only finds entry point scripts - # if their full path is provided. So we resort to using which. - dune_execute_process(COMMAND which generate_operators - OUTPUT_VARIABLE fullcommand - OUTPUT_STRIP_TRAILING_WHITESPACE - ) + # if their full path is provided. 
+ set(fullcommand "${DUNE_PYTHON_VIRTUALENV_PATH}/bin/generate_operators") endif() # Define build rules for all operator header files and gather a list of them diff --git a/cmake/modules/Findlikwid.cmake b/cmake/modules/Findlikwid.cmake new file mode 100644 index 0000000000000000000000000000000000000000..778901280ee0b778bd7131cf1fcee4b3181557f3 --- /dev/null +++ b/cmake/modules/Findlikwid.cmake @@ -0,0 +1,104 @@ +# .. cmake_module:: +# +# Module that checks whether likwid is available and usable. +# +# Variables used by this module which you may want to set: +# +# :ref:`likwid_ROOT` +# Path list to search for likwid. +# +# Sets the following variables: +# +# :code:`likwid_FOUND` +# True if likwid available. +# +# :code:`likwid_INCLUDE_DIRS` +# Path to the likwid include directories. +# +# +# :code:`likwid_LIBRARIES` +# Link against these libraries to use likwid. +# +# .. cmake_variable:: likwid_ROOT +# +# You may set this variable to have :ref:`Findlikwid` look +# for the likwid package in the given paths before inspecting +# system paths. 
+#
+find_path(LIKWID_INCLUDE_DIR
+  NAMES "likwid.h"
+  PATHS ${likwid_ROOT}
+  PATH_SUFFIXES "include" "include/likwid"
+  NO_DEFAULT_PATH)
+find_path(LIKWID_INCLUDE_DIR
+  NAMES "likwid.h"
+  PATH_SUFFIXES "include" "include/likwid")
+
+find_library(LIKWID_LIBRARY
+  NAMES "likwid"
+  PATHS ${likwid_ROOT}
+  PATH_SUFFIXES "lib" "lib32" "lib64"
+  NO_DEFAULT_PATH)
+find_library(LIKWID_LIBRARY
+  NAMES "likwid"
+  PATH_SUFFIXES "lib" "lib32" "lib64")
+
+include(CMakePushCheckState)
+cmake_push_check_state()  # NOTE(review): no check_* call runs before the pop below, so the pushed state is never used
+
+if(LIKWID_INCLUDE_DIR)
+  set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${LIKWID_INCLUDE_DIR})
+endif()
+if(LIKWID_LIBRARY)
+  set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} ${LIKWID_LIBRARY})
+endif()
+
+cmake_pop_check_state()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+  "likwid"
+  DEFAULT_MSG
+  LIKWID_INCLUDE_DIR
+  LIKWID_LIBRARY
+)
+
+mark_as_advanced(LIKWID_INCLUDE_DIR LIKWID_LIBRARY)
+
+# if headers are found, store results
+if(likwid_FOUND)
+  set(likwid_INCLUDE_DIRS ${LIKWID_INCLUDE_DIR})
+  set(likwid_LIBRARIES ${LIKWID_LIBRARY})
+  # log result
+  file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+    "Determing location of likwid succeeded:\n"
+    "Include directory: ${likwid_INCLUDE_DIRS}\n"
+    "Libraries to link against: ${likwid_LIBRARIES}\n\n")
+
+  set(likwid_DUNE_COMPILE_FLAGS "-I${likwid_INCLUDE_DIRS}"
+    CACHE STRING "Compile Flags used by DUNE when compiling with likwid programs")
+  set(likwid_DUNE_LIBRARIES ${likwid_LIBRARIES}
+    CACHE STRING "Libraries used by DUNE when linking likwid programs")
+else()
+  # log erroneous result; the likwid_* result variables are only set on success, so report the raw search results
+  file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+    "Determing location of likwid failed:\n"
+    "Include directory: ${LIKWID_INCLUDE_DIR}\n"
+    "Libraries to link against: ${LIKWID_LIBRARY}\n\n")
+endif()
+
+# set HAVE_LIKWID for config.h
+set(HAVE_LIKWID ${likwid_FOUND})
+
+
+# register all likwid related flags
+if(likwid_FOUND)
+ 
dune_register_package_flags(COMPILE_DEFINITIONS "ENABLE_LIKWID=1"
+                              LIBRARIES "${likwid_LIBRARIES}"
+                              INCLUDE_DIRS "${likwid_INCLUDE_DIRS}")
+endif()
+
+# text for feature summary (the name must match the spelling used by find_package(likwid))
+set_package_properties("likwid" PROPERTIES
+  DESCRIPTION "likwid"
+  PURPOSE "Performance monitoring and benchmarking suite.")
diff --git a/dune/codegen/common/simdtraits.hh b/dune/codegen/common/simdtraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..73ee4caff1bb743d275e20173556416cc0bd2d30
--- /dev/null
+++ b/dune/codegen/common/simdtraits.hh
@@ -0,0 +1,16 @@
+#ifndef DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+#define DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH
+
+/** This is just the declaration of the traits classes, specialization for VCL and
+ *  OpCounter VCL are elsewhere.
+ */
+
+template<typename T>
+struct base_floatingpoint
+{};
+
+template<typename T>
+struct simd_size
+{};
+
+#endif
diff --git a/dune/codegen/common/vcltraits.hh b/dune/codegen/common/vcltraits.hh
new file mode 100644
index 0000000000000000000000000000000000000000..de764dbe6f78a51d1bf5934856ca36d4b90a77d0
--- /dev/null
+++ b/dune/codegen/common/vcltraits.hh
@@ -0,0 +1,86 @@
+#ifndef DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+#define DUNE_CODEGEN_COMMON_VCLTRAITS_HH
+
+/** A collection of traits tools for the Vector Class Library */
+
+#include<cstddef>  // std::size_t used by the simd_size specializations
+#include<dune/codegen/common/vectorclass.hh>
+
+template<>
+struct base_floatingpoint<Vec2d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec4f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec2d>
+{
+  static constexpr std::size_t value = 2;
+};
+
+template<>
+struct simd_size<Vec4f>
+{
+  static constexpr std::size_t value = 4;
+};
+
+#if MAX_VECTOR_SIZE >= 256
+template<>
+struct base_floatingpoint<Vec4d>
+{
+  using value = double;
+};
+
+template<>
+struct base_floatingpoint<Vec8f>
+{
+  using value = float;
+};
+
+template<>
+struct simd_size<Vec4d>
+{
+  static constexpr std::size_t value = 4;
+};
+
+template<> +struct simd_size<Vec8f> +{ + static constexpr std::size_t value = 8; +}; +#endif + +#if MAX_VECTOR_SIZE >= 512 +template<> +struct base_floatingpoint<Vec8d> +{ + using value = double; +}; + +template<> +struct base_floatingpoint<Vec16f> +{ + using value = float; +}; + +template<> +struct simd_size<Vec8d> +{ + static constexpr std::size_t value = 8; +}; + +template<> +struct simd_size<Vec16f> +{ + static constexpr std::size_t value = 16; +}; + +#endif + +#endif diff --git a/dune/codegen/common/vectorclass.hh b/dune/codegen/common/vectorclass.hh index aa3bba7b98f9cf57dcc223f61755e3aa26c5bdca..648ea52fe24490efc8af431fe54de4089515af33 100644 --- a/dune/codegen/common/vectorclass.hh +++ b/dune/codegen/common/vectorclass.hh @@ -1,71 +1,23 @@ #ifndef DUNE_CODEGEN_COMMON_VECTORCLASS_HH #define DUNE_CODEGEN_COMMON_VECTORCLASS_HH - -template<typename T> -struct base_floatingpoint -{}; +#include<dune/codegen/common/simdtraits.hh> #ifdef ENABLE_COUNTER + #if HAVE_DUNE_OPCOUNTER #include<dune/opcounter/vectorclass.hh> - -template<typename F, int size> -struct base_floatingpoint<OpCounter::impl::OpCounterVector<F, size>> -{ - using value = OpCounter::OpCounter<F>; -}; - - #else #error "dune-opcounter is needed for opcounted vector types" #endif + #else + #include<dune/codegen/vectorclass/vectorclass.h> #include<dune/codegen/vectorclass/vectormath_exp.h> #include<dune/codegen/vectorclass/vectormath_hyp.h> #include<dune/codegen/vectorclass/vectormath_trig.h> - -template<> -struct base_floatingpoint<Vec2d> -{ - using value = double; -}; - -template<> -struct base_floatingpoint<Vec4f> -{ - using value = float; -}; - - -#if MAX_VECTOR_SIZE >= 256 -template<> -struct base_floatingpoint<Vec4d> -{ - using value = double; -}; - -template<> -struct base_floatingpoint<Vec8f> -{ - using value = float; -}; -#endif - -#if MAX_VECTOR_SIZE >= 512 -template<> -struct base_floatingpoint<Vec8d> -{ - using value = double; -}; - -template<> -struct base_floatingpoint<Vec16f> -{ - using 
value = float; -}; -#endif +#include<dune/codegen/common/vcltraits.hh> #endif diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b96bd06cbce6361f1130e17bb7cb4d97efe08ec7..8881504f934e7b1d755580ffc46d47f2a1467346 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,5 +25,5 @@ add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc) target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing) if(benchmark_FOUND) - target_link_libraries(_autotune_target benchmark::benchmark) + target_link_libraries(_autotune_target benchmark) endif() diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py index d0cf1d4dc8b6880db13bfdec4458d815ea8208c2..bed0256407b7259bab61b6e932c4a17761097e75 100644 --- a/python/dune/codegen/generation/__init__.py +++ b/python/dune/codegen/generation/__init__.py @@ -16,6 +16,7 @@ from dune.codegen.generation.cpp import (base_class, class_member, constructor_parameter, dump_accumulate_timer, + register_liwkid_timer, end_of_file, include_file, initializer_list, diff --git a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py index 29384f98554ab895d37670c017fe5ecc4f191655..b918291067f45c5f988bc8fdcea55651d538a9db 100644 --- a/python/dune/codegen/generation/cpp.py +++ b/python/dune/codegen/generation/cpp.py @@ -50,3 +50,8 @@ def dump_accumulate_timer(name): code = "DUMP_TIMER({},{},{},{});".format(get_option("instrumentation_level"), name, os, reset) return code + + +@generator_factory(item_tags=("register_likwid_timers",)) +def register_liwkid_timer(name): + return "LIKWID_MARKER_REGISTER(\"{}\");".format(name) diff --git a/python/dune/codegen/loopy/symbolic.py b/python/dune/codegen/loopy/symbolic.py index 799c7d6040e2e7d9e40c296d73c391aeb43cc6f3..c76fbb063b88b62732ac54bbce39f532ba96aa9a 100644 --- a/python/dune/codegen/loopy/symbolic.py +++ b/python/dune/codegen/loopy/symbolic.py @@ -122,6 +122,7 @@ 
lp.type_inference.TypeInferenceMapper.map_vectorized_sumfact_kernel = needs_reso # FusedMultiplyAdd node lp.symbolic.IdentityMapper.map_fused_multiply_add = identity_map_fused_multiply_add +lp.symbolic.SubstitutionMapper.map_fused_multiply_add = identity_map_fused_multiply_add lp.symbolic.WalkMapper.map_fused_multiply_add = walk_map_fused_multiply_add lp.symbolic.StringifyMapper.map_fused_multiply_add = stringify_map_fused_multiply_add lp.symbolic.DependencyMapper.map_fused_multiply_add = dependency_map_fused_multiply_add diff --git a/python/dune/codegen/loopy/transformations/instrumentation.py b/python/dune/codegen/loopy/transformations/instrumentation.py index 7b13a09e597490dd1bef85344c9be6bdfb859dd3..2fab53a6215a15f0e06e7d42a18f4736b46da34a 100644 --- a/python/dune/codegen/loopy/transformations/instrumentation.py +++ b/python/dune/codegen/loopy/transformations/instrumentation.py @@ -1,9 +1,11 @@ """ Add instrumentation instructions to a kernel """ from dune.codegen.generation import (dump_accumulate_timer, + register_liwkid_timer, post_include, ) from dune.codegen.options import get_option +from dune.codegen.pdelab.driver.timings import start_region_timer_instruction, stop_region_timer_instruction import loopy as lp @@ -67,28 +69,25 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o # Define the start instruction and correct dependencies for it start_id = "{}_start".format(ident) start_depends = _union(tuple(i.depends_on for i in insns)).difference(frozenset(i.id for i in insns)) - start_insn = lp.CInstruction([], - "HP_TIMER_START({});".format(identifier), - id=start_id, - within_inames=within, - depends_on=depends_on.union(start_depends), - boostable_into=frozenset(), - tags=uniontags, - ) + start_insn = start_region_timer_instruction(identifier, + id=start_id, + within_inames=within, + depends_on=depends_on.union(start_depends), + boostable_into=frozenset(), + tags=uniontags,) # Add dependencies on the timing instructions 
rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({start_id}))) for i in insns]) # Define the stop instruction and correct dependencies for it stop_id = "{}_stop".format(ident) - stop_insn = lp.CInstruction([], - "HP_TIMER_STOP({});".format(identifier), - id=stop_id, - within_inames=within, - depends_on=frozenset(i.id for i in insns), - boostable_into=frozenset(), - tags=uniontags, - ) + stop_insn = stop_region_timer_instruction(identifier, + id=stop_id, + within_inames=within, + depends_on=frozenset(i.id for i in insns), + boostable_into=frozenset(), + tags=uniontags, + ) # Find all the instructions that should depend on stop dep_insns = filter(lambda i: _intersect((i.depends_on, frozenset(i.id for i in insns))), @@ -97,8 +96,11 @@ def add_instrumentation(knl, match, identifier, level, filetag='operatorfile', o rewritten_insns.extend([i.copy(depends_on=i.depends_on.union(frozenset({stop_id}))) for i in dep_insns]) # Trigger code generation on the file/operator level - post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag) - dump_accumulate_timer(identifier) + if get_option("use_likwid"): + register_liwkid_timer(identifier) + else: + post_include('HP_DECLARE_TIMER({});'.format(identifier), filetag=filetag) + dump_accumulate_timer(identifier) # Filter all the instructions which were untouched other_insns = list(filter(lambda i: i.id not in [j.id for j in rewritten_insns], knl.instructions)) diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py index 76a7f54f9d3516ee047913cb0ebd4947c189536b..c8eece5274d632fc764286bbcf1f346ba28092bd 100644 --- a/python/dune/codegen/options.py +++ b/python/dune/codegen/options.py @@ -55,6 +55,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord): target_name = CodegenOption(default=None, helpstr="The target name from CMake") operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. 
CMake sets this one!!!") debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).") + use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.") autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).") # Arguments that are mainly to be set by logic depending on other options diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py index 6526dcbf85cfb60d663a3f0193dde07351a361bd..b60544c1c78242f1490c76c46d4be6a2c4448501 100644 --- a/python/dune/codegen/pdelab/driver/__init__.py +++ b/python/dune/codegen/pdelab/driver/__init__.py @@ -270,15 +270,14 @@ def generate_driver(): # Make sure that timestream is declared before retrieving chache items if get_option("instrumentation_level") >= 1: - from dune.codegen.pdelab.driver.timings import setup_timer, name_timing_stream + from dune.codegen.pdelab.driver.timings import setup_timer setup_timer() - timestream = name_timing_stream() from dune.codegen.pdelab.driver.error import return_statement return_statement() from dune.codegen.generation import retrieve_cache_items - from cgen import FunctionDeclaration, FunctionBody, Block, Value, LineComment, Line + from cgen import FunctionDeclaration, FunctionBody, Block, Value, LineComment, Line, Generable driver_signature = FunctionDeclaration(Value('int', 'main'), [Value('int', 'argc'), Value('char**', 'argv')]) contents = [] @@ -292,6 +291,11 @@ def generate_driver(): contents.append(Line("\n")) add_section("init", "Initialize basic stuff...") + + if get_option("instrumentation_level") >= 1: + init_contents = contents + contents = [] + add_section("grid", "Setup grid (view)...") add_section("fem", "Set up finite element maps...") add_section("gfs", "Set up grid function spaces...") @@ -306,13 +310,14 @@ 
def generate_driver(): add_section("error", "Maybe calculate errors for test results...") if get_option("instrumentation_level") >= 1: - from dune.codegen.generation import post_include - post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver") - contents.insert(0, Line(text="HP_TIMER_START(driver);\n")) - contents.insert(len(contents) - 1, Line(text="HP_TIMER_STOP(driver);\n")) - contents.insert(len(contents) - 1, Line(text="DUMP_TIMER({}, driver, {}, true);\n".format(get_option("instrumentation_level"), timestream))) - contents.insert(0, Line(text="\n")) - driver_body = Block(contents) + from dune.codegen.pdelab.driver.timings import timed_region + contents = init_contents + timed_region('driver', contents) + + add_section("end", "Stuff that should happen at the end...") + add_section("return_stmt", "Return statement...") + + contents.insert(0, "\n") + driver_body = Block([c if isinstance(c, Generable) else Line(c + '\n') for c in contents]) # Wrap a try/catch block around the driver body from dune.codegen.cgen import CatchBlock, TryCatchBlock, Value, Block, Line diff --git a/python/dune/codegen/pdelab/driver/error.py b/python/dune/codegen/pdelab/driver/error.py index cf9fe42933ca41f2696b39324bb4f443c35d9003..02207b4986dccd7a652c1943402243e755a8c681 100644 --- a/python/dune/codegen/pdelab/driver/error.py +++ b/python/dune/codegen/pdelab/driver/error.py @@ -186,8 +186,7 @@ def compare_L2_squared(): " {} = true;".format(fail)] -@preamble(section="error") +@preamble(section="return_stmt") def return_statement(): - from dune.codegen.pdelab.driver.error import name_test_fail_variable fail = name_test_fail_variable() return "return {};".format(fail) diff --git a/python/dune/codegen/pdelab/driver/solve.py b/python/dune/codegen/pdelab/driver/solve.py index 79dfac051c66c2faa50c4e262eb419ed30b5fde1..4a6a3c9e7e235ee5368927fc6f97ac817ffde5f5 100644 --- a/python/dune/codegen/pdelab/driver/solve.py +++ b/python/dune/codegen/pdelab/driver/solve.py @@ -57,23 +57,8 @@ def 
dune_solve(): if get_form_option("generate_jacobians"): print_matrix() - if get_option('instrumentation_level') >= 2: - from dune.codegen.pdelab.driver.timings import setup_timer, name_timing_stream, name_timing_identifier - timestream = name_timing_stream() - setup_timer() - from dune.codegen.generation import post_include - post_include("HP_DECLARE_TIMER(solve);", filetag="driver") - - solve = ["HP_TIMER_START(solve);", - "{}".format(solve), - "HP_TIMER_STOP(solve);", - "DUMP_TIMER({}, solve, {}, true);".format(get_option("instrumentation_level"), timestream), - ] - - if get_option('instrumentation_level') >= 3: - from dune.codegen.pdelab.driver.gridoperator import name_localoperator - lop_name = name_localoperator(form_ident) - solve.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) + from dune.codegen.pdelab.driver.timings import timed_region + solve = timed_region('solve', solve) return solve diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py index 714f263a353c5a8a3a6b3b83dbc651ffd3401961..aeca64d46c73f2327b48c22f07dca7a85a044104 100644 --- a/python/dune/codegen/pdelab/driver/timings.py +++ b/python/dune/codegen/pdelab/driver/timings.py @@ -1,12 +1,11 @@ """ Timing related generator functions """ -from dune.codegen.options import get_option from dune.codegen.generation import (cached, include_file, pre_include, - post_include, preamble, ) +from dune.codegen.options import get_option from dune.codegen.pdelab.driver import (get_form_ident, is_linear, name_initree, @@ -21,7 +20,7 @@ from dune.codegen.pdelab.driver.gridoperator import (name_gridoperator, type_gridoperator, ) from dune.codegen.pdelab.driver.solve import (name_vector, - type_vector, + define_vector, ) @@ -90,109 +89,172 @@ def name_timing_stream(): return name +def name_temporary_vector(name, form): + name = "{}_{}".format(name, form) + define_vector(name, form) + return name + + 
+@preamble(section="timings") +def define_jacobian(name, form_ident): + t_go = type_gridoperator(form_ident) + n_go = name_gridoperator(form_ident) + return ["using M_{} = typename {}::Traits::Jacobian;".format(form_ident, t_go), + "M_{} {}({});".format(form_ident, name, n_go)] + + +def name_jacobian(form_ident): + name = "J_{}".format(form_ident) + define_jacobian(name, form_ident) + return name + + +@preamble(section="init") +def init_likwid(): + return ["LIKWID_MARKER_INIT;", "LIKWID_MARKER_THREADINIT;"] + + +@preamble(section="end") +def finalize_likwid(): + return ["LIKWID_MARKER_CLOSE;"] + + +@preamble(section="timings") +def local_operator_likwid(): + lop_name = name_localoperator(get_form_ident()) + return "{}.register_likwid_timers();".format(lop_name) + + @cached def setup_timer(): # TODO check that we are using YASP? - if get_option('opcounter'): - pre_include("#define ENABLE_COUNTER", filetag="driver") - pre_include("#define ENABLE_HP_TIMERS", filetag="driver") - include_file("dune/codegen/common/timer.hh", filetag="driver") + if get_option("use_likwid"): + pre_include("#define LIKWID_PERFMON", filetag="driver") + include_file("likwid.h", filetag="driver") + init_likwid() + if get_option('instrumentation_level') >= 3: + import logging + logger = logging.getLogger(__name__) + logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably") + local_operator_likwid() + finalize_likwid() + else: + from dune.codegen.loopy.target import type_floatingpoint + pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver") + if get_option('opcounter'): + pre_include("#define ENABLE_COUNTER", filetag="driver") + pre_include("#define ENABLE_HP_TIMERS", filetag="driver") + include_file("dune/codegen/common/timer.hh", filetag="driver") -@preamble(section="timings") -def evaluate_residual_timer(): - n_go = name_gridoperator(get_form_ident()) - v = name_vector(get_form_ident()) - t_v = 
type_vector(get_form_ident()) - setup_timer() +@preamble(section="init") +def init_likwid_timer(region): + return ["LIKWID_MARKER_REGISTER(\"{}\");".format(region)] - if get_option('instrumentation_level') >= 2: - # Write back times + +def init_region_timer(region): + setup_timer() + if get_option("use_likwid"): + init_likwid_timer(region) + else: from dune.codegen.generation import post_include - post_include("HP_DECLARE_TIMER(residual_evaluation);", filetag="driver") + post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver") + + +def start_region_timer(region): + if get_option("use_likwid"): + return ["LIKWID_MARKER_START(\"{}\");".format(region)] + else: + return ["HP_TIMER_START({});".format(region)] + + +def stop_region_timer(region): + if get_option("use_likwid"): + return ["LIKWID_MARKER_STOP(\"{}\");".format(region)] + else: timestream = name_timing_stream() - print_times = [] + return ["HP_TIMER_STOP({});".format(region), + "DUMP_TIMER({}, {}, {}, true);".format(get_option("instrumentation_level"), region, timestream)] - lop_name = name_localoperator(get_form_ident()) - if get_option('instrumentation_level') >= 3: - print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) - if get_option('instrumentation_level') >= 2: - evaluation = ["HP_TIMER_START(residual_evaluation);", - "{}.residual({}, r);".format(n_go, v), - "HP_TIMER_STOP(residual_evaluation);", - "DUMP_TIMER({}, residual_evaluation, {}, true);".format(get_option("instrumentation_level"), timestream)] - evaluation.extend(print_times) +def start_region_timer_instruction(region, **kwargs): + if get_option("use_likwid"): + code = "LIKWID_MARKER_START(\"{}\");".format(region) else: - evaluation = ["{}.residual({}, r);".format(n_go, v)] + code = "HP_TIMER_START({});".format(region) + from loopy import CInstruction + return CInstruction([], code, **kwargs) - evaluation = ["{} r({});".format(t_v, v), "r=0.0;"] + evaluation - return evaluation 
+def stop_region_timer_instruction(region, **kwargs): + if get_option("use_likwid"): + code = "LIKWID_MARKER_STOP(\"{}\");".format(region) + else: + code = "HP_TIMER_STOP({});".format(region) + from loopy import CInstruction + return CInstruction([], code, **kwargs) -@preamble(section="timings") -def apply_jacobian_timer(): - n_go = name_gridoperator(get_form_ident()) - v = name_vector(get_form_ident()) - t_v = type_vector(get_form_ident()) - setup_timer() +def timed_region(region, actions): + if isinstance(actions, str): + actions = [actions] + + assert(isinstance(actions, list)) if get_option('instrumentation_level') >= 2: - # Write back times - from dune.codegen.generation import post_include - post_include("HP_DECLARE_TIMER(apply_jacobian);", filetag="driver") - timestream = name_timing_stream() + assembly = [] print_times = [] - lop_name = name_localoperator(get_form_ident()) - if get_option('instrumentation_level') >= 3: - print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) + init_region_timer(region) - if is_linear(): - declaration = ["{} j({});".format(t_v, v), "j=0.0;"] - evaluation = ["{}.jacobian_apply({}, j);".format(n_go, v)] - else: - declaration = ["{} j0({});".format(t_v, v), "j0=0.0;", - "{} j1({});".format(t_v, v), "j1=0.0;"] - evaluation = ["{}.nonlinear_jacobian_apply({}, j0, j1);".format(n_go, v)] + if get_option('instrumentation_level') >= 3 and not get_option('use_likwid'): + timestream = name_timing_stream() + lop_name = name_localoperator(get_form_ident()) + print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) - if get_option('instrumentation_level') >= 2: - evaluation = ["HP_TIMER_START(apply_jacobian);"] + evaluation + ["HP_TIMER_STOP(apply_jacobian);", "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)] - evaluation.extend(print_times) + assembly += start_region_timer(region) + 
assembly += actions + assembly += stop_region_timer(region) - return declaration + evaluation + return assembly + print_times + else: + return actions @preamble(section="timings") -def assemble_matrix_timer(): - t_go = type_gridoperator(get_form_ident()) +def evaluate_residual_timer(): n_go = name_gridoperator(get_form_ident()) v = name_vector(get_form_ident()) - t_v = type_vector(get_form_ident()) - setup_timer() + r = name_temporary_vector("r", get_form_ident()) - if get_option('instrumentation_level') >= 2: - # Write back times - from dune.codegen.generation import post_include - post_include("HP_DECLARE_TIMER(matrix_assembly);", filetag="driver") - timestream = name_timing_stream() - print_times = [] + action = "{}.residual({}, {});".format(n_go, v, r) - lop_name = name_localoperator(get_form_ident()) - if get_option('instrumentation_level') >= 3: - print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) + return timed_region("residual_evaluation", action) - if get_option('instrumentation_level') >= 2: - assembly = ["HP_TIMER_START(matrix_assembly);", - "{}.jacobian({},m);".format(n_go, v), - "HP_TIMER_STOP(matrix_assembly);", - "DUMP_TIMER({}, matrix_assembly, {}, true);".format(get_option("instrumentation_level"), timestream)] - assembly.extend(print_times) + +@preamble(section="timings") +def apply_jacobian_timer(): + form = get_form_ident() + n_go = name_gridoperator(form) + v = name_vector(form) + + if is_linear(): + j = name_temporary_vector("j", form) + action = "{}.jacobian_apply({}, {});".format(n_go, v, j) else: - assembly = ["{}.jacobian({},m);".format(n_go, v)] + j0 = name_temporary_vector("j0", form) + j1 = name_temporary_vector("j1", form) + action = "{}.nonlinear_jacobian_apply({}, {}, {});".format(n_go, v, j0, j1) + + return timed_region("apply_jacobian", action) + + +@preamble(section="timings") +def assemble_matrix_timer(): + n_go = name_gridoperator(get_form_ident()) + v = 
name_vector(get_form_ident()) + m = name_jacobian(get_form_ident()) - assembly = ["using M = typename {}::Traits::Jacobian;".format(t_go), - "M m({});".format(n_go)] + assembly + action = "{}.jacobian({},{});".format(n_go, v, m) - return assembly + return timed_region("matrix_assembly", action) diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py index 3c06fa2bc0dcd14f68248a4c063dc0ba906b2568..dfc553fbefee7fa8c2e6aa1d92ae7ea047acdd38 100644 --- a/python/dune/codegen/pdelab/localoperator.py +++ b/python/dune/codegen/pdelab/localoperator.py @@ -17,6 +17,7 @@ from dune.codegen.generation import (accumulation_mixin, constructor_parameter, domain, dump_accumulate_timer, + register_liwkid_timer, end_of_file, function_mangler, generator_factory, @@ -677,6 +678,19 @@ class TimerMethod(ClassMember): ClassMember.__init__(self, content) +class RegisterLikwidMethod(ClassMember): + def __init__(self): + knl = name_example_kernel() + assert(knl is not None) + + content = ["void register_likwid_timers()",  # one list entry per generated line; the comma was missing + "{"] + register_liwkid_timers = [i for i in retrieve_cache_items(condition='register_likwid_timers')] + content.extend(map(lambda x: ' ' + x, register_liwkid_timers)) + content += ["}"] + ClassMember.__init__(self, content) + + class LoopyKernelMethod(ClassMember): def __init__(self, signature, kernel, add_timings=True, initializer_list=[]): from loopy import generate_body @@ -697,26 +711,49 @@ class LoopyKernelMethod(ClassMember): from dune.codegen.pdelab.signatures import assembler_routine_name timer_name = assembler_routine_name() + '_kernel' name_example_kernel(name=timer_name) - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - content.append(' ' + 'HP_TIMER_START({});'.format(timer_name)) - dump_accumulate_timer(timer_name) - if add_timings and get_option("instrumentation_level") >= 4: - setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) - 
post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile') - content.append(' HP_TIMER_START({});'.format(setuptimer)) - dump_accumulate_timer(setuptimer) + if get_option('use_likwid'): + from dune.codegen.pdelab.driver.timings import init_likwid_timer + include_file("likwid.h", filetag="operatorfile") + init_likwid_timer(timer_name) + content.append(' ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name)) + register_liwkid_timer(timer_name) + else: + post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') + content.append(' ' + 'HP_TIMER_START({});'.format(timer_name)) + dump_accumulate_timer(timer_name) + + if add_timings and get_option("instrumentation_level") >= 4: + setuptimer = '{}_kernel_setup'.format(assembler_routine_name()) + if get_option('use_likwid'): + from dune.codegen.pdelab.driver.timings import init_likwid_timer + init_likwid_timer(setuptimer) + content.append(' ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer)) + register_liwkid_timer(setuptimer) + else: + post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile') + content.append(' HP_TIMER_START({});'.format(setuptimer)) + dump_accumulate_timer(setuptimer) # Add kernel preamble for i, p in kernel.preambles: content.append(' ' + p) + if add_timings and get_option('instrumentation_level') >= 4: + if get_option('use_likwid'): + content.append(' ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer)) + else: + content.append(' ' + 'HP_TIMER_STOP({});'.format(setuptimer)) + # Add kernel body content.extend(l for l in generate_body(kernel).split('\n')[1:-1]) # Stop timer if add_timings and get_option('instrumentation_level') >= 3: - content.append(' ' + 'HP_TIMER_STOP({});'.format(timer_name)) + if get_option('use_likwid'): + content.append(' ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name)) + else: + content.append(' ' + 'HP_TIMER_STOP({});'.format(timer_name)) content.append('}') ClassMember.__init__(self, content, 
name=kernel.name if kernel is not None else "") @@ -1145,7 +1182,10 @@ def generate_localoperator_file(kernels, filename): if get_option('instrumentation_level') >= 3: include_file('dune/codegen/common/timer.hh', filetag='operatorfile') - operator_methods.append(TimerMethod()) + if get_option('use_likwid'): + operator_methods.append(RegisterLikwidMethod()) + else: + operator_methods.append(TimerMethod()) elif get_option('opcounter'): include_file('dune/codegen/common/timer.hh', filetag='operatorfile') diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index b4a48b49473c64b3b46136f7654c34fc53f43ca0..97e7f1d540823d1753197272f0ccd0a415d8a8b2 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -7,6 +7,7 @@ import filelock import hashlib import json from operator import mul +import time import loopy as lp from pytools import product @@ -96,10 +97,8 @@ def write_global_data(sf, filename): def write_setup_code(sf, filename, define_thetas=True): - opcounting = get_option("opcounter") with open(filename, "a") as f: # Setup a polynomial object (normally done in the LocalOperator members) - set_option("opcounter", False) from dune.codegen.loopy.target import type_floatingpoint real = type_floatingpoint() f.write(" using RF = {};\n".format(real)) @@ -118,11 +117,12 @@ def write_setup_code(sf, filename, define_thetas=True): constructor_knl = lp.get_one_scheduled_kernel(constructor_knl) # Allocate buffers + alignment = get_option("max_vector_width") // 8 size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width, product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width) size = int(size * (get_option("precision_bits") / 8)) - f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size), - " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size), + f.writelines([" char buffer0[{}] 
__attribute__ ((aligned ({})));\n".format(size, alignment), + " char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment), ]) # Setup fastdg inputs @@ -131,7 +131,7 @@ def write_setup_code(sf, filename, define_thetas=True): f.write("{} = 0;\n".format(arg)) else: size = sf.interface.fastdg_interface_object_size - f.write(" RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size)) + f.write(" RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment)) # Write stuff into the input buffer f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real), @@ -174,12 +174,15 @@ def write_setup_code(sf, filename, define_thetas=True): " rng.seed(42);\n", " std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)), ]) - return opcounting def generate_standalone_code_google_benchmark(sf, filename): delete_cache_items("kernel_default") + # Turn off opcounting + opcounting = get_option("opcounter") + set_option("opcounter", False) + # Extract sum factorization kernel from dune.codegen.pdelab.localoperator import extract_kernel_from_cache knl = realize_sumfact_kernel_function(sf) @@ -207,6 +210,10 @@ def generate_standalone_code_google_benchmark(sf, filename): write_global_data(sf, filename) with open(filename, "a") as f: + arguments = ', '.join(sf.interface.signature_args) + if len(arguments) > 0: + arguments = ', ' + arguments + arguments = 'const char* buffer0, const char* buffer1' + arguments f.write("void sumfact_kernel({})\n".format(arguments)) for line in knl.member.lines[1:]: f.write("{}\n".format(line)) @@ -214,7 +221,7 @@ def generate_standalone_code_google_benchmark(sf, filename): f.write("\n\n") f.write("static void BM_sumfact_kernel(benchmark::State& state){\n") - opcounting = write_setup_code(sf, filename, define_thetas=False) + write_setup_code(sf, filename, define_thetas=False) additional_arguments = [i.split()[-1] for i in sf.interface.signature_args] 
additional_arguments = ', '.join(additional_arguments) @@ -229,12 +236,18 @@ def generate_standalone_code_google_benchmark(sf, filename): "\n", "BENCHMARK_MAIN();" ]) + + # Maybe turn opcounting on again set_option("opcounter", opcounting) def generate_standalone_code(sf, filename): delete_cache_items("kernel_default") + # Turn off opcounting + opcounting = get_option("opcounter") + set_option("opcounter", False) + # Extract sum factorization kernel from dune.codegen.pdelab.localoperator import extract_kernel_from_cache knl = realize_sumfact_kernel_function(sf) @@ -259,7 +272,7 @@ def generate_standalone_code(sf, filename): "{\n", ]) - opcounting = write_setup_code(sf, filename) + write_setup_code(sf, filename) # Write measurement with open(filename, "a") as f: @@ -285,133 +298,12 @@ def generate_standalone_code(sf, filename): " std::cout << accum;\n", "}\n", ]) - set_option("opcounter", opcounting) - - -def generate_standalone_kernel_code(kernel, signature, filename): - with open(filename, 'w') as f: - # Write headers - headers = ['#include "config.h"', - '#include <iostream>', - '#include <fstream>', - '#include <random>', - '#include "benchmark/benchmark.h"', - '#include <dune/codegen/common/vectorclass.hh>', - '#include <dune/codegen/sumfact/horizontaladd.hh>', - ] - f.write("\n".join(headers)) - - # Get a list of the function argument names - assert len(signature) == 1 - sig = signature[0] - arguments = sig[sig.find('(') +1:sig.find(')')].split(',') - arguments = [a.split(' ')[-1] for a in arguments] - - global_args = [a for a in kernel.args if a.name not in arguments] - - # Declare global arguments - f.write('\n\n') - target = DuneTarget() - for g in global_args: - decl_info = g.decl_info(target, True, g.dtype) - for idi in decl_info: - ast_builder = target.get_device_ast_builder() - arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) - arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) - arg_decl = AlignedAttribute(g.dtype.itemsize * 
g.vector_size(target), arg_decl) - f.write('{}\n'.format(arg_decl)) - - # Generate function we want to benchmark - f.write('\n') - f.write(sig[0:sig.find(')')+1]) - f.writelines(lp.generate_body(kernel)) - f.write('\n\n') - - # Generate function that will do the benchmarking - f.write('static void BM_sumfact_kernel(benchmark::State& state){\n') - - # Declare random generators - real = type_floatingpoint() - lines = [' std::uniform_real_distribution<{}> unif(0,1);'.format(real), - ' std::uniform_int_distribution<int> unif_int(0,128);', - ' std::default_random_engine re;'] - f.write('\n'.join(lines) + '\n') - - # Declare function arguments - function_arguments = [a for a in kernel.args if a.name in arguments] - for arg in function_arguments: - if 'buffer' in arg.name: - byte_size = reduce(mul, arg.shape) * 8 - f.write(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name, - byte_size, - arg.alignment),) - elif isinstance(arg, lp.ValueArg): - assert 'jacobian_offset' in arg.name - decl = arg.get_arg_decl(ast_builder) - decl = Initializer(decl, 'unif_int(re)') - f.write(' {}\n'.format(decl)) - else: - assert 'fastdg' in arg.name - size = reduce(mul, arg.shape) - alignment = arg.dtype.itemsize - f.write(' {} {}[{}] __attribute__ ((aligned ({})));\n'.format(real, - arg.name, - size, - alignment)) - - # Initialize arguments - def _initialize_arg(arg): - if isinstance(arg, lp.ValueArg): - return [] - real = type_floatingpoint() - size = reduce(mul, arg.shape) - fill_name = arg.name + '_fill' - lines = [' {}* {} = (double *) {};'.format(real, fill_name, arg.name), - ' for (std::size_t i=0; i<{}; ++i){{'.format(size), - ' {}[i] = unif(re);'.format(fill_name), - ' }'] - return lines - - for arg in kernel.args: - lines = _initialize_arg(arg) - f.write('\n'.join(lines) + '\n') - - # Benchmark loop - function_call = kernel.name + '({})'.format(','.join(arguments)) - f.writelines([' for (auto _ : state){\n', - ' {};\n'.format(function_call), - ' }\n', - ]) - 
f.write('}\n') - - # Benchmark main - main = ['', - 'BENCHMARK(BM_sumfact_kernel);', - '', - 'BENCHMARK_MAIN();'] - f.write('\n'.join(main)) - - -def autotune_realization(sf=None, kernel=None, signature=None): - """"Generate an executable run a benchmark and return time - - The benchmark can be generated from a SumfactKernel or a LoopKernel with a - function signature. For SumfactKernels you can generate benchmarks with or - without google benchmark, for LoopKernels only google benchmarks are - possible. - - Parameters - ---------- - sf : SumfactKernel or VectorizedSumfactKernel - kernel : loopy.kernel.LoopKernel - signature : str - """ - if sf is None: - assert kernel and signature - assert get_option("autotune_google_benchmark") - else: - assert kernel is None and signature is None + # Maybe turn opcounting on again + set_option("opcounter", opcounting) + + +def autotune_realization(sf): # Make sure that the benchmark directory exists dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks") if not os.path.exists(dir): @@ -452,6 +344,10 @@ def autotune_realization(sf=None, kernel=None, signature=None): if ret != 0: raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call))) + # File system synchronization! + while not os.path.exists(executable): + time.sleep(0.01) + # Check whether the user specified an execution wrapper call = [] wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER") @@ -469,10 +365,19 @@ def autotune_realization(sf=None, kernel=None, signature=None): if ret != 0: raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call))) + # File system synchronization! 
+ while not os.path.exists(logname): + time.sleep(0.01) + # Extract the result form the log file if get_option("autotune_google_benchmark"): + import json with open(logname) as json_file: - data = json.load(json_file) - return data['benchmarks'][0]['cpu_time'] + try: + data = json.load(json_file) + return data['benchmarks'][0]['cpu_time'] + except Exception as e: + print("Error while loading file {}".format(logname)) + raise e else: return float(next(iter(open(logname, "r")))) / 1000000