Skip to content

Commit

Permalink
Reland [OpenMP][libomptarget] Enable parallel copies via multiple SDM… (
Browse files Browse the repository at this point in the history
llvm#72307)

…A engines (llvm#71801)

This enables the AMDGPU plugin to use a new ROCm 5.7 interface to
dispatch asynchronous data transfers across SDMA engines.

The default functionality stays unchanged, meaning that all data
transfers are enqueued into an H2D queue or a D2H queue, depending on
transfer direction, via the HSA interface used previously.

The new interface can be enabled via the environment variable
`LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true` when libomptarget
is built against a recent ROCm version (5.7 and later). As of now,
requests are distributed in a round-robin fashion across available SDMA
engines.
  • Loading branch information
jplehr authored and zahiraam committed Nov 20, 2023
1 parent bd261cf commit 48a223c
Showing 1 changed file with 86 additions and 44 deletions.
130 changes: 86 additions & 44 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,45 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
"Error in hsa_amd_agent_iterate_memory_pools: %s");
}

/// Dispatches an asynchronous memory copy of \p Size bytes from \p Src (owned
/// by \p SrcAgent) to \p Dst (owned by \p DstAgent).
///
/// When \p UseMultipleSdmaEngines is false, the copy is issued through
/// hsa_amd_memory_async_copy. When it is true, the copy is issued through
/// hsa_amd_memory_async_copy_on_engine and consecutive calls rotate through
/// the SDMA engine ids 0x1, 0x2 and 0x4 in a round-robin fashion. If the HSA
/// AMD interface this file is compiled against is older than version 1.2, the
/// multi-engine path is unavailable and an error is returned instead.
///
/// \param NumDepSignals    number of entries in \p DepSignals.
/// \param DepSignals       dependency signals forwarded to the HSA copy call;
///                         callers in this file pass nullptr together with a
///                         count of zero.
/// \param CompletionSignal signal forwarded to the HSA copy call as its
///                         completion signal.
/// \returns the result of Plugin::check on the HSA status, or a Plugin::error
///          when the multi-engine interface is not available at compile time.
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
                   hsa_signal_t CompletionSignal) {
  if (!UseMultipleSdmaEngines) {
    hsa_status_t S =
        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
                                  NumDepSignals, DepSignals, CompletionSignal);
    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
  }

// hsa_amd_memory_async_copy_on_engine needs HSA AMD interface version 1.2 or
// newer. Compare major and minor as a pair so that a later major version with
// a small minor number (e.g., 2.0) is still accepted.
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR > 1 ||                                   \
      (HSA_AMD_INTERFACE_VERSION_MAJOR == 1 &&                                 \
       HSA_AMD_INTERFACE_VERSION_MINOR >= 2))
  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
#else
  // Engine id to be used by the next dispatch, shared by all callers.
  static std::atomic<int> SdmaEngine{1};

  // Claim the current engine id and advance the shared one in a single atomic
  // read-modify-write, so concurrent callers never lose an advance. Relaxed
  // ordering suffices: the value only selects an engine and does not publish
  // any other data. On failure, compare_exchange_weak reloads LocalSdmaEngine.
  // The successor rotates through the three engine ids: 0x1 -> 0x2 -> 0x4 ->
  // 0x1 (8 % 7 == 1).
  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_relaxed);
  while (!SdmaEngine.compare_exchange_weak(
      LocalSdmaEngine, (LocalSdmaEngine << 1) % 7, std::memory_order_relaxed)) {
  }

  // This call is only available in ROCm >= 5.7.
  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
      CompletionSignal, static_cast<hsa_amd_sdma_engine_id_t>(LocalSdmaEngine),
      /*force_copy_on_sdma=*/true);

  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
#endif
}

} // namespace utils

/// Utility class representing generic resource references to AMDGPU resources.
Expand Down Expand Up @@ -945,6 +984,9 @@ struct AMDGPUStreamTy {
/// Timeout hint for HSA actively waiting for signal value to change
const uint64_t StreamBusyWaitMicroseconds;

/// Indicates whether to spread data transfers across all available SDMA engines.
bool UseMultipleSdmaEngines;

/// Return the current number of asynchronous operations on the stream.
uint32_t size() const { return NextSlot; }

Expand Down Expand Up @@ -1170,15 +1212,15 @@ struct AMDGPUStreamTy {
InputSignal = nullptr;

// Issue the async memory copy.
hsa_status_t Status;
if (InputSignal) {
hsa_signal_t InputSignalRaw = InputSignal->get();
Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
&InputSignalRaw, OutputSignal->get());
} else
Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
nullptr, OutputSignal->get());
return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
CopySize, 1, &InputSignalRaw,
OutputSignal->get());
}

return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
CopySize, 0, nullptr, OutputSignal->get());
}

/// Push an asynchronous memory copy device-to-host involving an unpinned
Expand Down Expand Up @@ -1214,21 +1256,19 @@ struct AMDGPUStreamTy {

// Issue the first step: device to host transfer. Avoid defining the input
// dependency if already satisfied.
hsa_status_t Status;
if (InputSignal) {
hsa_signal_t InputSignalRaw = InputSignal->get();
Status =
hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
&InputSignalRaw, OutputSignals[0]->get());
if (auto Err = utils::asyncMemCopy(
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
&InputSignalRaw, OutputSignals[0]->get()))
return Err;
} else {
Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
nullptr, OutputSignals[0]->get());
if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
Src, Agent, CopySize, 0, nullptr,
OutputSignals[0]->get()))
return Err;
}

if (auto Err =
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
return Err;

// Consume another stream slot and compute dependencies.
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
assert(InputSignal && "Invalid input signal");
Expand All @@ -1242,7 +1282,7 @@ struct AMDGPUStreamTy {
std::atomic_thread_fence(std::memory_order_release);

// Issue the second step: host to host transfer.
Status = hsa_amd_signal_async_handler(
hsa_status_t Status = hsa_amd_signal_async_handler(
InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
(void *)&Slots[Curr]);

Expand Down Expand Up @@ -1318,16 +1358,14 @@ struct AMDGPUStreamTy {

// Issue the second step: host to device transfer. Avoid defining the input
// dependency if already satisfied.
hsa_status_t Status;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
&InputSignalRaw, OutputSignal->get());
} else
Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
nullptr, OutputSignal->get());

return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
Agent, CopySize, 1, &InputSignalRaw,
OutputSignal->get());
}
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
CopySize, 0, nullptr, OutputSignal->get());
}

// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
Expand All @@ -1353,17 +1391,15 @@ struct AMDGPUStreamTy {
// allocated by this runtime or the caller made the appropriate
// access calls.

hsa_status_t Status;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
Status =
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
&InputSignalRaw, OutputSignal->get());
} else
Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
0, nullptr, OutputSignal->get());

return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
SrcAgent, CopySize, 1, &InputSignalRaw,
OutputSignal->get());
}
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
SrcAgent, CopySize, 0, nullptr,
OutputSignal->get());
}

/// Synchronize with the stream. The current thread waits until all operations
Expand Down Expand Up @@ -1788,6 +1824,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
64),
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
OMPX_UseMultipleSdmaEngines(
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}

Expand Down Expand Up @@ -2206,10 +2244,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;

Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
0, nullptr, Signal.get());
if (auto Err =
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
Agent, PinnedPtr, Agent, Size, 0,
nullptr, Signal.get()))
return Err;

if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
Expand Down Expand Up @@ -2267,10 +2304,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;

Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
0, nullptr, Signal.get());
if (auto Err =
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
Agent, TgtPtr, Agent, Size, 0, nullptr,
Signal.get()))
return Err;

if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
Expand Down Expand Up @@ -2633,6 +2669,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
});
}

/// Returns the value of the OMPX_UseMultipleSdmaEngines envar, i.e., whether
/// asynchronous data transfers should be spread across multiple SDMA engines.
bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }

private:
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
Expand Down Expand Up @@ -2702,6 +2740,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// are microseconds.
UInt32Envar OMPX_StreamBusyWait;

/// Use ROCm 5.7 interface for multiple SDMA engines
BoolEnvar OMPX_UseMultipleSdmaEngines;

/// Stream manager for AMDGPU streams.
AMDGPUStreamManagerTy AMDGPUStreamManager;

Expand Down Expand Up @@ -2803,7 +2844,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
SignalManager(Device.getSignalManager()), Device(Device),
// Initialize the std::deque with some empty positions.
Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}

/// Class implementing the AMDGPU-specific functionalities of the global
/// handler.
Expand Down

0 comments on commit 48a223c

Please sign in to comment.