From 48a223c8494661d39777fe782edbb5647a376dd0 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Tue, 14 Nov 2023 21:30:04 +0100 Subject: [PATCH] =?UTF-8?q?Reland=20[OpenMP][libomptarget]=20Enable=20para?= =?UTF-8?q?llel=20copies=20via=20multiple=20SDM=E2=80=A6=20(#72307)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …A engines (#71801) This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch asynchronous data transfers across SDMA engines. The default functionality stays unchanged, meaning that all data transfers are enqueued into an H2D queue or a D2H queue, depending on transfer direction, via the HSA interface used previously. The new interface can be enabled via the environment variable `LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true` when libomptarget is built against a recent ROCm version (5.7 and later). As of now, requests are distributed in a round-robin fashion across available SDMA engines. --- .../plugins-nextgen/amdgpu/src/rtl.cpp | 130 ++++++++++++------ 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index a529c379844e90..8f64baa22cb39f 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -130,6 +130,45 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { "Error in hsa_amd_agent_iterate_memory_pools: %s"); } +/// Dispatches an asynchronous memory copy. +/// Enables different SDMA engines for the dispatch in a round-robin fashion. 
+Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
+                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
+                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
+                   hsa_signal_t CompletionSignal) {
+  // Default path: single call into the HSA async-copy interface.
+  if (!UseMultipleSdmaEngines) {
+    hsa_status_t S =
+        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
+                                  NumDepSignals, DepSignals, CompletionSignal);
+    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
+  }
+
+// The per-engine copy needs HSA AMD interface >= 1.2 (ROCm >= 5.7). Compare
+// (major, minor) as a pair so a future 2.0 interface is not rejected.
+#if !(HSA_AMD_INTERFACE_VERSION_MAJOR > 1 ||                                   \
+      (HSA_AMD_INTERFACE_VERSION_MAJOR == 1 &&                                 \
+       HSA_AMD_INTERFACE_VERSION_MINOR >= 2))
+  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
+#else
+  // Engine ids are one-hot masks; the cursor cycles 0x1 -> 0x2 -> 0x4 -> 0x1.
+  static std::atomic<int> SdmaEngine{1};
+  // Claim the current engine and advance the cursor in one atomic step, so
+  // concurrent callers each take their own slot in the rotation.
+  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_relaxed);
+  while (!SdmaEngine.compare_exchange_weak(LocalSdmaEngine,
+                                           (LocalSdmaEngine << 1) % 7,
+                                           std::memory_order_relaxed)) {
+  }
+
+  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
+      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
+      CompletionSignal, static_cast<hsa_amd_sdma_engine_id_t>(LocalSdmaEngine),
+      /*force_copy_on_sdma=*/true);
+  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
+#endif
+}
+
 } // namespace utils
 
 /// Utility class representing generic resource references to AMDGPU resources.
@@ -945,6 +984,9 @@ struct AMDGPUStreamTy { /// Timeout hint for HSA actively waiting for signal value to change const uint64_t StreamBusyWaitMicroseconds; + /// Indicate to spread data transfers across all avilable SDMAs + bool UseMultipleSdmaEngines; + /// Return the current number of asychronous operations on the stream. uint32_t size() const { return NextSlot; } @@ -1170,15 +1212,15 @@ struct AMDGPUStreamTy { InputSignal = nullptr; // Issue the async memory copy. - hsa_status_t Status; if (InputSignal) { hsa_signal_t InputSignalRaw = InputSignal->get(); - Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1, - &InputSignalRaw, OutputSignal->get()); - } else - Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0, - nullptr, OutputSignal->get()); - return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"); + return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, + CopySize, 1, &InputSignalRaw, + OutputSignal->get()); + } + + return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, + CopySize, 0, nullptr, OutputSignal->get()); } /// Push an asynchronous memory copy device-to-host involving an unpinned @@ -1214,21 +1256,19 @@ struct AMDGPUStreamTy { // Issue the first step: device to host transfer. Avoid defining the input // dependency if already satisfied. 
- hsa_status_t Status; if (InputSignal) { hsa_signal_t InputSignalRaw = InputSignal->get(); - Status = - hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1, - &InputSignalRaw, OutputSignals[0]->get()); + if (auto Err = utils::asyncMemCopy( + UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, + &InputSignalRaw, OutputSignals[0]->get())) + return Err; } else { - Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0, - nullptr, OutputSignals[0]->get()); + if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, + Src, Agent, CopySize, 0, nullptr, + OutputSignals[0]->get())) + return Err; } - if (auto Err = - Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) - return Err; - // Consume another stream slot and compute dependencies. std::tie(Curr, InputSignal) = consume(OutputSignals[1]); assert(InputSignal && "Invalid input signal"); @@ -1242,7 +1282,7 @@ struct AMDGPUStreamTy { std::atomic_thread_fence(std::memory_order_release); // Issue the second step: host to host transfer. - Status = hsa_amd_signal_async_handler( + hsa_status_t Status = hsa_amd_signal_async_handler( InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, (void *)&Slots[Curr]); @@ -1318,16 +1358,14 @@ struct AMDGPUStreamTy { // Issue the second step: host to device transfer. Avoid defining the input // dependency if already satisfied. 
- hsa_status_t Status; if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1, - &InputSignalRaw, OutputSignal->get()); - } else - Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0, - nullptr, OutputSignal->get()); - - return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"); + return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); + } + return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, + CopySize, 0, nullptr, OutputSignal->get()); } // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead @@ -1353,17 +1391,15 @@ struct AMDGPUStreamTy { // allocated by this runtime or the caller made the appropriate // access calls. - hsa_status_t Status; if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - Status = - hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1, - &InputSignalRaw, OutputSignal->get()); - } else - Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, - 0, nullptr, OutputSignal->get()); - - return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s"); + return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); + } + return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Synchronize with the stream. 
The current thread waits until all operations @@ -1788,6 +1824,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS", 64), OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000), + OMPX_UseMultipleSdmaEngines( + "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false), AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {} @@ -2206,10 +2244,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size, - 0, nullptr, Signal.get()); - if (auto Err = - Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) + if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, + Agent, PinnedPtr, Agent, Size, 0, + nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2267,10 +2304,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size, - 0, nullptr, Signal.get()); - if (auto Err = - Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) + if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, + Agent, TgtPtr, Agent, Size, 0, nullptr, + Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2633,6 +2669,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { }); } + bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; } + private: using AMDGPUEventRef = AMDGPUResourceRef; using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy; @@ -2702,6 +2740,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// are microseconds. 
UInt32Envar OMPX_StreamBusyWait; + /// Use ROCm 5.7 interface for multiple SDMA engines + BoolEnvar OMPX_UseMultipleSdmaEngines; + /// Stream manager for AMDGPU streams. AMDGPUStreamManagerTy AMDGPUStreamManager; @@ -2803,7 +2844,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) SignalManager(Device.getSignalManager()), Device(Device), // Initialize the std::deque with some empty positions. Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr), - StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {} + StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()), + UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {} /// Class implementing the AMDGPU-specific functionalities of the global /// handler.