diff --git a/Project.toml b/Project.toml index 3f73c968f13..effb24d3251 100644 --- a/Project.toml +++ b/Project.toml @@ -57,8 +57,10 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" +IntelITT = "c9b2f978-7543-4802-ae44-75068f23ee64" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" +NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" @@ -66,8 +68,10 @@ SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" TrixiAMDGPUExt = "AMDGPU" TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] +TrixiIntelITTExt = "IntelITT" TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" +TrixiNVTXExt = ["NVTX", "CUDA"] TrixiPlotsExt = "Plots" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" @@ -89,6 +93,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.38, 1" HDF5 = "0.17" +IntelITT = "0.2" KernelAbstractions = "0.9.38" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" @@ -97,6 +102,7 @@ MPI = "0.20.23" Makie = "0.22, 0.23, 0.24" MuladdMacro = "0.2.4" NLsolve = "4.5.1" +NVTX = "1.0.0" Octavian = "0.3.28" OffsetArrays = "1.13" P4est = "0.4.12" diff --git a/benchmark/CUDA/Project.toml b/benchmark/CUDA/Project.toml index 2e9f130fe6c..1bdbed2467b 100644 --- a/benchmark/CUDA/Project.toml +++ b/benchmark/CUDA/Project.toml @@ -1,6 +1,7 @@ [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" @@ -11,6 +12,7 @@ Trixi = {path = "../.."} [compat] CUDA = "5.8.2" JSON = "1.4.0" +NVTX = "1" 
OrdinaryDiffEqLowStorageRK = "1.12.0" TimerOutputs = "0.5.25" Trixi = "0.16" diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl index b9d02246c9b..89907cb9e9b 100644 --- a/benchmark/CUDA/run.jl +++ b/benchmark/CUDA/run.jl @@ -1,5 +1,6 @@ using Trixi using CUDA +using NVTX # Load to get tracing support for Trixi using TimerOutputs using JSON diff --git a/docs/src/performance.md b/docs/src/performance.md index 3d4ff8fdfce..505663735ae 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -288,3 +288,44 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension, timing result, you need to set the analysis interval such that the `AnalysisCallback` is invoked at least once during the course of the simulation and discard the first PID value. + +## Tracing support for profilers + +Trixi supports tracing profiler integration through [ittapi](https://github.com/intel/ittapi) for Intel VTune and [NVTX](https://github.com/NVIDIA/NVTX) for [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems). + +!!! note "Extensions" + Tracing support is implemented through extensions and requires trigger packages to be loaded. + +Tracing support is only available for regions that are instrumented with `@trixi_timeit_ext`. + +### Using Intel VTune + +We can use Intel VTune to profile CPU code. For more information see the [Julia documentation](https://docs.julialang.org/en/v1/manual/profile/#External-Profiling) and the [IntelITT.jl](https://github.com/JuliaPerf/IntelITT.jl) package. + +!!! note "Trigger package" + ```julia + using IntelITT + ``` + +To get the most out of Intel VTune we recommend passing the environment variable `ENABLE_JITPROFILING=1` to Julia, which will allow you to symbolize JIT-compiled call frames. + +!!! note "Usage of `juliaup`" + Sometimes `juliaup` can make it harder for a profiler to attach to the right process. 
You can use `Base.julia_cmd()` in the REPL to obtain the path to the actual Julia binary you will be running. + + +### NVIDIA Nsight Systems + +We can use NVIDIA Nsight Systems to trace GPU code. + +We recommend reading the CUDA.jl documentation on using [Nsight Systems](https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems). + +!!! note "Trigger package" + ```julia + using CUDA + using NVTX + ``` + +You can also just use `CUDA.@profile` (see [Integrated Profiler](https://cuda.juliagpu.org/stable/development/profiling/#Integrated-profiler)) to obtain profiler results that include the NVTX ranges. + +#### Known limitation +Nsight Systems can also be used for CPU and in particular MPI codes. The Trixi extension will only be enabled when a GPU backend is being used. diff --git a/ext/TrixiIntelITTExt.jl b/ext/TrixiIntelITTExt.jl new file mode 100644 index 00000000000..3ce1158ed6e --- /dev/null +++ b/ext/TrixiIntelITTExt.jl @@ -0,0 +1,28 @@ +module TrixiIntelITTExt + +using Trixi: CPU +import Trixi: trixi_range_active, trixi_range_start, trixi_range_end + +import IntelITT + +const domain = Ref{IntelITT.Domain}() +function __init__() + domain[] = IntelITT.Domain("Trixi") +end + +function trixi_range_active(::Union{Nothing, CPU}) + return IntelITT.isactive() +end + +function trixi_range_start(::Union{Nothing, CPU}, label) + task = IntelITT.Task(domain[], label) + IntelITT.start(task) + return task +end + +function trixi_range_end(::Union{Nothing, CPU}, id) + IntelITT.stop(id) + return nothing +end + +end # module diff --git a/ext/TrixiNVTXExt.jl b/ext/TrixiNVTXExt.jl new file mode 100644 index 00000000000..69c79c7a3d0 --- /dev/null +++ b/ext/TrixiNVTXExt.jl @@ -0,0 +1,25 @@ +module TrixiNVTXExt + +using NVTX +using CUDA: CUDABackend +import Trixi: trixi_range_active, trixi_range_start, trixi_range_end + +# One can also use Nsight Systems and thus NVTX for CPU code + +const domain = NVTX.Domain("Trixi") +const color = 0xff40e0d0 # turquoise + +function 
trixi_range_active(::CUDABackend) + return NVTX.isactive() +end + +function trixi_range_start(::CUDABackend, label) + return NVTX.range_start(NVTX.init!(domain); message = label, color = color) +end + +function trixi_range_end(::CUDABackend, id) + NVTX.range_end(id) + return nothing +end + +end # module diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 28e4c47d339..887b8e10746 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -82,6 +82,35 @@ end return ncalls_first end +# TODO: move to KernelAbstractions +""" + trixi_range_active(backend) + +Returns `true` if the given `backend` supports range annotations and a profiler is active, `false` otherwise. +""" +function trixi_range_active(backend::Any) + return false +end + +""" + trixi_range_start(backend, label) + +Starts a range annotation for the given `backend` with the specified `label`. +Returns a handle to the started range, which should be passed to `trixi_range_end` to end the range annotation. +""" +function trixi_range_start(backend::Any, label) + return nothing +end + +""" + trixi_range_end(backend, id) + +Ends a range annotation for the given `backend` with the specified `id`. +""" +function trixi_range_end(backend::Any, id) + return nothing +end + """ @trixi_timeit_ext backend timer() "some label" expression @@ -93,10 +122,17 @@ See also [`@trixi_timeit`](@ref). """ macro trixi_timeit_ext(backend, timer_output, label, expr) expr = quote + local active = $trixi_range_active($(esc(backend))) + if active + id = $trixi_range_start($(esc(backend)), $(esc(label))) + end local val = $(esc(expr)) if $(esc(backend)) !== nothing && $(TrixiBase).timeit_debug_enabled() $(KernelAbstractions.synchronize)($(esc(backend))) end + if active + $trixi_range_end($(esc(backend)), id) + end val end return :(@trixi_timeit($(esc(timer_output)), $(esc(label)), $(expr)))