
Commit 5c22b90

Reland [OpenMP][libomptarget] Enable parallel copies via multiple SDMA engines (#71801) (#72307)

This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch asynchronous data transfers across SDMA engines. The default functionality stays unchanged, meaning that all data transfers are enqueued into an H2D queue or a D2H queue, depending on transfer direction, via the HSA interface used previously. The new interface can be enabled via the environment variable `LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true` when libomptarget is built against a recent ROCm version (5.7 or later). As of now, requests are distributed in a round-robin fashion across the available SDMA engines.
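To illustrate the round-robin dispatch described above, here is a minimal standalone sketch (not part of the commit) of how the three one-hot SDMA engine IDs 0x1, 0x2, 0x4 are cycled; the `pickNextSdmaEngine` helper and the `main` driver are illustrative only, and the actual HSA copy calls are omitted.

```cpp
#include <atomic>
#include <cstdio>

// The three SDMA engines are addressed with one-hot IDs: 0x1, 0x2, 0x4.
static std::atomic<int> SdmaEngine{1};

// Returns the engine ID to use for the next transfer and advances the shared
// counter. A racy read by two threads only means both pick the same engine,
// which is sub-optimal but still correct.
static int pickNextSdmaEngine() {
  int Local = SdmaEngine.load(std::memory_order_acquire);
  // (1 << 1) % 7 == 2, (2 << 1) % 7 == 4, (4 << 1) % 7 == 1, ...
  SdmaEngine.store((Local << 1) % 7, std::memory_order_relaxed);
  return Local;
}

int main() {
  for (int I = 0; I < 6; ++I)
    std::printf("transfer %d -> SDMA engine 0x%x\n", I, pickNextSdmaEngine());
  return 0;
}
```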
1 parent 011c9ee commit 5c22b90

File tree

1 file changed (+86 −44 lines)
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

@@ -130,6 +130,45 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
       "Error in hsa_amd_agent_iterate_memory_pools: %s");
 }
 
+/// Dispatches an asynchronous memory copy.
+/// Enables different SDMA engines for the dispatch in a round-robin fashion.
+Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
+                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
+                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
+                   hsa_signal_t CompletionSignal) {
+  if (!UseMultipleSdmaEngines) {
+    hsa_status_t S =
+        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
+                                  NumDepSignals, DepSignals, CompletionSignal);
+    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
+  }
+
+  // This solution is probably not the best
+#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
+      HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
+  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
+#else
+  static std::atomic<int> SdmaEngine{1};
+
+  // This atomics solution is probably not the best, but should be sufficient
+  // for now.
+  // In a worst case scenario, in which threads read the same value, they will
+  // dispatch to the same SDMA engine. This may result in sub-optimal
+  // performance. However, I think the likelihood of this is fairly low.
+  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
+  // This call is only available in ROCm >= 5.7.
+  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
+      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
+      CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
+      /*force_copy_on_sdma=*/true);
+  // Advance to the next of the three SDMA engines: 0x1, 0x2, 0x4.
+  LocalSdmaEngine = (LocalSdmaEngine << 1) % 7;
+  SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);
+
+  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
+#endif
+}
+
 } // namespace utils
 
 /// Utility class representing generic resource references to AMDGPU resources.
@@ -945,6 +984,9 @@ struct AMDGPUStreamTy {
   /// Timeout hint for HSA actively waiting for signal value to change
   const uint64_t StreamBusyWaitMicroseconds;
 
+  /// Indicate to spread data transfers across all available SDMAs
+  bool UseMultipleSdmaEngines;
+
   /// Return the current number of asynchronous operations on the stream.
   uint32_t size() const { return NextSlot; }
 
@@ -1170,15 +1212,15 @@ struct AMDGPUStreamTy {
     InputSignal = nullptr;
 
     // Issue the async memory copy.
-    hsa_status_t Status;
     if (InputSignal) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
-                                         &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
-                                         nullptr, OutputSignal->get());
-    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
+                                 CopySize, 1, &InputSignalRaw,
+                                 OutputSignal->get());
+    }
+
+    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
+                               CopySize, 0, nullptr, OutputSignal->get());
   }
 
   /// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1256,19 @@ struct AMDGPUStreamTy {
 
     // Issue the first step: device to host transfer. Avoid defining the input
     // dependency if already satisfied.
-    hsa_status_t Status;
     if (InputSignal) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status =
-          hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
-                                    &InputSignalRaw, OutputSignals[0]->get());
+      if (auto Err = utils::asyncMemCopy(
+              UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
+              &InputSignalRaw, OutputSignals[0]->get()))
+        return Err;
     } else {
-      Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
-                                         nullptr, OutputSignals[0]->get());
+      if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
+                                         Src, Agent, CopySize, 0, nullptr,
+                                         OutputSignals[0]->get()))
+        return Err;
     }
 
-    if (auto Err =
-            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
-      return Err;
-
     // Consume another stream slot and compute dependencies.
     std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
     assert(InputSignal && "Invalid input signal");
@@ -1242,7 +1282,7 @@ struct AMDGPUStreamTy {
     std::atomic_thread_fence(std::memory_order_release);
 
     // Issue the second step: host to host transfer.
-    Status = hsa_amd_signal_async_handler(
+    hsa_status_t Status = hsa_amd_signal_async_handler(
         InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
         (void *)&Slots[Curr]);
 
@@ -1318,16 +1358,14 @@ struct AMDGPUStreamTy {
 
     // Issue the second step: host to device transfer. Avoid defining the input
     // dependency if already satisfied.
-    hsa_status_t Status;
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
-                                         &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
-                                         nullptr, OutputSignal->get());
-
-    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
+                                 Agent, CopySize, 1, &InputSignalRaw,
+                                 OutputSignal->get());
+    }
+    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
+                               CopySize, 0, nullptr, OutputSignal->get());
   }
 
   // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1391,15 @@ struct AMDGPUStreamTy {
     // allocated by this runtime or the caller made the appropriate
     // access calls.
 
-    hsa_status_t Status;
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status =
-          hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
-                                    &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
-                                         0, nullptr, OutputSignal->get());
-
-    return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+                                 SrcAgent, CopySize, 1, &InputSignalRaw,
+                                 OutputSignal->get());
+    }
+    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+                               SrcAgent, CopySize, 0, nullptr,
+                               OutputSignal->get());
   }
 
   /// Synchronize with the stream. The current thread waits until all operations
@@ -1788,6 +1824,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
         OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
                                64),
         OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
+        OMPX_UseMultipleSdmaEngines(
+            "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
         AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
         AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
 
@@ -2206,10 +2244,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = Signal.init())
       return Err;
 
-    Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
-                                       0, nullptr, Signal.get());
-    if (auto Err =
-            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+    if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
+                                       Agent, PinnedPtr, Agent, Size, 0,
+                                       nullptr, Signal.get()))
       return Err;
 
     if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2267,10 +2304,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = Signal.init())
       return Err;
 
-    Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
-                                       0, nullptr, Signal.get());
-    if (auto Err =
-            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+    if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
+                                       Agent, TgtPtr, Agent, Size, 0, nullptr,
+                                       Signal.get()))
       return Err;
 
     if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2633,6 +2669,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     });
   }
 
+  bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
+
 private:
   using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2702,6 +2740,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// are microseconds.
   UInt32Envar OMPX_StreamBusyWait;
 
+  /// Use ROCm 5.7 interface for multiple SDMA engines
+  BoolEnvar OMPX_UseMultipleSdmaEngines;
+
   /// Stream manager for AMDGPU streams.
   AMDGPUStreamManagerTy AMDGPUStreamManager;
 
@@ -2803,7 +2844,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
       SignalManager(Device.getSignalManager()), Device(Device),
       // Initialize the std::deque with some empty positions.
       Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
-      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
+      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
+      UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
 
 /// Class implementing the AMDGPU-specific functionalities of the global
 /// handler.
