From a34261f526c0f85a7cacaf2f9e4f17af4af7152d Mon Sep 17 00:00:00 2001 From: pchintalapudi <34727397+pchintalapudi@users.noreply.github.com> Date: Sat, 22 Apr 2023 20:44:16 +0000 Subject: [PATCH] Add ITTAPI timing functions to JL_TIMING (#49448) --- src/codegen.cpp | 6 +-- src/gc.c | 8 ++-- src/jitlayers.cpp | 4 +- src/threading.c | 1 + src/timing.c | 34 +++++++++----- src/timing.h | 117 ++++++++++++++++++++++++++++++++-------------- 6 files changed, 115 insertions(+), 55 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 3cde68c60ced3..b50110a20a8fc 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -8435,7 +8435,7 @@ jl_llvm_functions_t jl_emit_code( jl_value_t *jlrettype, jl_codegen_params_t ¶ms) { - JL_TIMING(CODEGEN, CODEGEN); + JL_TIMING(CODEGEN, CODEGEN_LLVM); jl_timing_show_func_sig((jl_value_t *)li->specTypes, JL_TIMING_CURRENT_BLOCK); // caller must hold codegen_lock jl_llvm_functions_t decls = {}; @@ -8478,7 +8478,7 @@ jl_llvm_functions_t jl_emit_codeinst( jl_code_info_t *src, jl_codegen_params_t ¶ms) { - JL_TIMING(CODEGEN, CODEGEN); + JL_TIMING(CODEGEN, CODEGEN_Codeinst); jl_timing_show_method_instance(codeinst->def, JL_TIMING_CURRENT_BLOCK); JL_GC_PUSH1(&src); if (!src) { @@ -8558,7 +8558,7 @@ void jl_compile_workqueue( Module &original, jl_codegen_params_t ¶ms, CompilationPolicy policy) { - JL_TIMING(CODEGEN, CODEGEN); + JL_TIMING(CODEGEN, CODEGEN_Workqueue); jl_code_info_t *src = NULL; JL_GC_PUSH1(&src); while (!params.workqueue.empty()) { diff --git a/src/gc.c b/src/gc.c index 8653b8a05851f..388cf7fa6a671 100644 --- a/src/gc.c +++ b/src/gc.c @@ -311,7 +311,7 @@ NOINLINE uintptr_t gc_get_stack_ptr(void) void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) { - JL_TIMING(GC, Stop); + JL_TIMING(GC, GC_Stop); #ifdef USE_TRACY TracyCZoneCtx ctx = *(JL_TIMING_CURRENT_BLOCK->tracy_ctx); TracyCZoneColor(ctx, 0x696969); @@ -3043,7 +3043,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t start_mark_time = jl_hrtime(); JL_PROBE_GC_MARK_BEGIN(); { - JL_TIMING(GC, Mark); + JL_TIMING(GC, GC_Mark); // 1. fix GC bits of objects in the remset. assert(gc_n_threads); @@ -3204,7 +3204,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t start_sweep_time = jl_hrtime(); JL_PROBE_GC_SWEEP_BEGIN(sweep_full); { - JL_TIMING(GC, Sweep); + JL_TIMING(GC, GC_Sweep); #ifdef USE_TRACY if (sweep_full) { TracyCZoneCtx ctx = *(JL_TIMING_CURRENT_BLOCK->tracy_ctx); @@ -3405,7 +3405,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // Doing this on all threads is racy (it's impossible to check // or wait for finalizers on other threads without dead lock). if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { - JL_TIMING(GC, Finalizers); + JL_TIMING(GC, GC_Finalizers); run_finalizers(ct); } JL_PROBE_GC_FINALIZER(); diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 0f75789077dbc..37302e8ca2ace 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -192,7 +192,7 @@ static jl_callptr_t _jl_compile_codeinst( "invalid world for method-instance"); assert(src && jl_is_code_info(src)); - JL_TIMING(LLVM_MODULE_FINISH, LLVM_MODULE_FINISH); + JL_TIMING(CODEINST_COMPILE, CODEINST_COMPILE); jl_callptr_t fptr = NULL; // emit the code in LLVM IR form @@ -1435,7 +1435,7 @@ void JuliaOJIT::addGlobalMapping(StringRef Name, uint64_t Addr) void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) { - JL_TIMING(LLVM_MODULE_FINISH, LLVM_MODULE_FINISH); + JL_TIMING(LLVM_ORC, LLVM_ORC); ++ModulesAdded; orc::SymbolLookupSet NewExports; TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { diff --git a/src/threading.c b/src/threading.c index 76c21496927a8..ea9ec8e16ca45 100644 --- a/src/threading.c +++ b/src/threading.c @@ -744,6 +744,7 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint) lock->count++; return; } + JL_TIMING(LOCK_SPIN, LOCK_SPIN); jl_profile_lock_start_wait(lock); while (1) { if (owner == NULL && jl_atomic_cmpswap(&lock->owner, &owner, self)) { diff --git a/src/timing.c b/src/timing.c index 7337902010a3d..65c350f247b27 100644 --- a/src/timing.c +++ b/src/timing.c @@ -11,28 +11,26 @@ extern "C" { #endif #ifdef ENABLE_TIMINGS -#include "timing.h" #ifndef HAVE_TIMING_SUPPORT #error Timings are not supported on your compiler #endif static uint64_t t0; -#ifdef USE_TRACY +#if defined(USE_TRACY) || defined(USE_ITTAPI) /** * These sources often generate millions of events / minute. Although Tracy * can generally keep up with that, those events also bloat the saved ".tracy" * files, so we disable them by default. **/ -JL_DLLEXPORT uint64_t jl_timing_enable_mask = 0xFFFFFFFFFFFFFFFF & - ~(1ull << JL_TIMING_ROOT) & - ~(1ull << JL_TIMING_TYPE_CACHE_LOOKUP) & - ~(1ull << JL_TIMING_METHOD_MATCH) & - ~(1ull << JL_TIMING_METHOD_LOOKUP_FAST) & - ~(1ull << JL_TIMING_AST_COMPRESS) & - ~(1ull << JL_TIMING_AST_UNCOMPRESS); +JL_DLLEXPORT uint64_t jl_timing_enable_mask = ~((1ull << JL_TIMING_ROOT) | + (1ull << JL_TIMING_TYPE_CACHE_LOOKUP) | + (1ull << JL_TIMING_METHOD_MATCH) | + (1ull << JL_TIMING_METHOD_LOOKUP_FAST) | + (1ull << JL_TIMING_AST_COMPRESS) | + (1ull << JL_TIMING_AST_UNCOMPRESS)); #else -JL_DLLEXPORT uint64_t jl_timing_enable_mask = 0xFFFFFFFFFFFFFFFF; +JL_DLLEXPORT uint64_t jl_timing_enable_mask = ~0ull; #endif JL_DLLEXPORT uint64_t jl_timing_counts[(int)JL_TIMING_LAST] = {0}; @@ -43,11 +41,15 @@ JL_DLLEXPORT uint32_t jl_timing_print_limit = 10; const char *jl_timing_names[(int)JL_TIMING_LAST] = { -#define X(name) #name +#define X(name) #name, JL_TIMING_OWNERS #undef X }; +#ifdef USE_ITTAPI +JL_DLLEXPORT __itt_event jl_timing_ittapi_events[(int)JL_TIMING_EVENT_LAST]; +#endif + void jl_print_timings(void) { uint64_t total_time = cycleclock() - t0; @@ -66,6 +68,16 @@ void jl_print_timings(void) void jl_init_timing(void) { t0 = cycleclock(); + + _Static_assert(JL_TIMING_EVENT_LAST < sizeof(uint64_t) * CHAR_BIT, "Too many timing events!"); + _Static_assert((int)JL_TIMING_LAST <= (int)JL_TIMING_EVENT_LAST, "More owners than events!"); + + int i = 0; +#ifdef USE_ITTAPI +#define X(name) jl_timing_ittapi_events[i++] = __itt_event_create(#name, strlen(#name)); + JL_TIMING_EVENTS +#undef X +#endif } void jl_destroy_timing(void) diff --git a/src/timing.h b/src/timing.h index de8d980c357fb..ddf9b1d5201d8 100644 --- a/src/timing.h +++ b/src/timing.h @@ -3,6 +3,8 @@ #ifndef JL_TIMING_H #define JL_TIMING_H +#include "julia.h" + #ifdef __cplusplus extern "C" { #endif @@ -106,39 +108,60 @@ void jl_timing_printf(jl_timing_block_t *cur_block, const char *format, ...); #define JL_TIMING_CURRENT_BLOCK (&__timing_block) #endif -#define JL_TIMING_OWNERS \ - X(ROOT), \ - X(GC), \ - X(LOWERING), \ - X(PARSING), \ - X(INFERENCE), \ - X(CODEGEN), \ - X(METHOD_LOOKUP_SLOW), \ - X(METHOD_LOOKUP_FAST), \ - X(LLVM_OPT), \ - X(LLVM_MODULE_FINISH), \ - X(METHOD_MATCH), \ - X(TYPE_CACHE_LOOKUP), \ - X(TYPE_CACHE_INSERT), \ - X(STAGED_FUNCTION), \ - X(MACRO_INVOCATION), \ - X(AST_COMPRESS), \ - X(AST_UNCOMPRESS), \ - X(SYSIMG_LOAD), \ - X(SYSIMG_DUMP), \ - X(NATIVE_DUMP), \ - X(ADD_METHOD), \ - X(LOAD_MODULE), \ - X(SAVE_MODULE), \ - X(INIT_MODULE), +#define JL_TIMING_OWNERS \ + X(ROOT) \ + X(GC) \ + X(LOWERING) \ + X(PARSING) \ + X(INFERENCE) \ + X(CODEGEN) \ + X(METHOD_LOOKUP_SLOW) \ + X(METHOD_LOOKUP_FAST) \ + X(CODEINST_COMPILE) \ + X(LLVM_OPT) \ + X(LLVM_ORC) \ + X(METHOD_MATCH) \ + X(TYPE_CACHE_LOOKUP) \ + X(TYPE_CACHE_INSERT) \ + X(STAGED_FUNCTION) \ + X(MACRO_INVOCATION) \ + X(AST_COMPRESS) \ + X(AST_UNCOMPRESS) \ + X(SYSIMG_LOAD) \ + X(SYSIMG_DUMP) \ + X(NATIVE_DUMP) \ + X(ADD_METHOD) \ + X(LOAD_MODULE) \ + X(SAVE_MODULE) \ + X(INIT_MODULE) \ + X(LOCK_SPIN) \ + + +#define JL_TIMING_EVENTS \ + JL_TIMING_OWNERS \ + X(GC_Stop) \ + X(GC_Mark) \ + X(GC_Sweep) \ + X(GC_Finalizers) \ + X(CODEGEN_LLVM) \ + X(CODEGEN_Codeinst) \ + X(CODEGEN_Workqueue) \ + enum jl_timing_owners { -#define X(name) JL_TIMING_ ## name +#define X(name) JL_TIMING_ ## name, JL_TIMING_OWNERS #undef X JL_TIMING_LAST }; +enum jl_timing_events { +#define X(name) JL_TIMING_EVENT_ ## name, + JL_TIMING_EVENTS +#undef X + JL_TIMING_EVENT_LAST +}; + /** * Timing back-ends differ in terms of whether they support nested * and asynchronous events. @@ -177,6 +200,18 @@ enum jl_timing_owners { #define _TRACY_DESTROY(block) #endif +#ifdef USE_ITTAPI +#define _ITTAPI_CTX_MEMBER int event; +#define _ITTAPI_CTOR(block, event) block->event = event +#define _ITTAPI_START(block) if (_jl_timing_enabled(block->event)) __itt_event_start(jl_timing_ittapi_events[block->event]) +#define _ITTAPI_STOP(block) if (_jl_timing_enabled(block->event)) __itt_event_end(jl_timing_ittapi_events[block->event]) +#else +#define _ITTAPI_CTX_MEMBER +#define _ITTAPI_CTOR(block, event) +#define _ITTAPI_START(block) +#define _ITTAPI_STOP(block) +#endif + /** * Implementation: Aggregated counts back-end **/ @@ -225,16 +260,27 @@ STATIC_INLINE void _jl_timing_counts_destroy(jl_timing_counts_t *block) JL_NOTSA extern uint64_t jl_timing_enable_mask; extern const char *jl_timing_names[(int)JL_TIMING_LAST]; +#ifdef USE_ITTAPI +extern __itt_event jl_timing_ittapi_events[(int)JL_TIMING_EVENT_LAST]; +#endif + struct _jl_timing_block_t { // typedef in julia.h struct _jl_timing_block_t *prev; _TRACY_CTX_MEMBER + _ITTAPI_CTX_MEMBER _COUNTS_CTX_MEMBER }; -STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int owner) JL_NOTSAFEPOINT { +STATIC_INLINE int _jl_timing_enabled(int event) JL_NOTSAFEPOINT { + return !!(jl_timing_enable_mask & (1 << event)); +} + +STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int owner, int event) JL_NOTSAFEPOINT { uint64_t t = cycleclock(); (void)t; _COUNTS_CTOR(&block->counts_ctx, owner); _COUNTS_START(&block->counts_ctx, t); + _ITTAPI_CTOR(block, event); + _ITTAPI_START(block); jl_task_t *ct = jl_current_task; jl_timing_block_t **prevp = &ct->ptls->timing_stack; @@ -248,6 +294,7 @@ STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int owner) JL STATIC_INLINE void _jl_timing_block_destroy(jl_timing_block_t *block) JL_NOTSAFEPOINT { uint64_t t = cycleclock(); (void)t; + _ITTAPI_STOP(block); _COUNTS_STOP(&block->counts_ctx, t); _COUNTS_DESTROY(&block->counts_ctx); _TRACY_DESTROY(block->tracy_ctx); @@ -281,8 +328,8 @@ STATIC_INLINE void _jl_timing_suspend_destroy(jl_timing_suspend_t *suspend) JL_N #ifdef __cplusplus struct jl_timing_block_cpp_t { jl_timing_block_t block; - jl_timing_block_cpp_t(int owner) JL_NOTSAFEPOINT { - _jl_timing_block_ctor(&block, owner); + jl_timing_block_cpp_t(int owner, int event) JL_NOTSAFEPOINT { + _jl_timing_block_ctor(&block, owner, event); } ~jl_timing_block_cpp_t() JL_NOTSAFEPOINT { _jl_timing_block_destroy(&block); @@ -292,13 +339,13 @@ struct jl_timing_block_cpp_t { jl_timing_block_cpp_t& operator=(const jl_timing_block_cpp_t &) = delete; jl_timing_block_cpp_t& operator=(const jl_timing_block_cpp_t &&) = delete; }; -#define JL_TIMING(subsystem, event) jl_timing_block_cpp_t __timing_block(JL_TIMING_ ## subsystem); \ +#define JL_TIMING(subsystem, event) jl_timing_block_cpp_t __timing_block(JL_TIMING_ ## subsystem, JL_TIMING_EVENT_ ## event); \ _TRACY_CTOR(__timing_block.block.tracy_ctx, #event, (jl_timing_enable_mask >> (JL_TIMING_ ## subsystem)) & 1) #else #define JL_TIMING(subsystem, event) \ __attribute__((cleanup(_jl_timing_block_destroy))) \ jl_timing_block_t __timing_block; \ - _jl_timing_block_ctor(&__timing_block, JL_TIMING_ ## subsystem); \ + _jl_timing_block_ctor(&__timing_block, JL_TIMING_ ## subsystem, JL_TIMING_EVENT_ ## event); \ _TRACY_CTOR(__timing_block.tracy_ctx, #event, (jl_timing_enable_mask >> (JL_TIMING_ ## subsystem)) & 1) #endif @@ -311,10 +358,10 @@ struct jl_timing_suspend_cpp_t { ~jl_timing_suspend_cpp_t() JL_NOTSAFEPOINT { _jl_timing_suspend_destroy(&suspend); } - jl_timing_suspend_cpp_t(const jl_timing_block_cpp_t&) = delete; - jl_timing_suspend_cpp_t(const jl_timing_block_cpp_t&&) = delete; - jl_timing_suspend_cpp_t& operator=(const jl_timing_block_cpp_t &) = delete; - jl_timing_suspend_cpp_t& operator=(const jl_timing_block_cpp_t &&) = delete; + jl_timing_suspend_cpp_t(const jl_timing_suspend_cpp_t &) = delete; + jl_timing_suspend_cpp_t(jl_timing_suspend_cpp_t &&) = delete; + jl_timing_suspend_cpp_t& operator=(const jl_timing_suspend_cpp_t &) = delete; + jl_timing_suspend_cpp_t& operator=(jl_timing_suspend_cpp_t &&) = delete; }; #define JL_TIMING_SUSPEND(subsystem, ct) jl_timing_suspend_cpp_t __suspend_block(#subsystem, ct) #else