@@ -130,6 +130,45 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
130
130
" Error in hsa_amd_agent_iterate_memory_pools: %s" );
131
131
}
132
132
133
+ // / Dispatches an asynchronous memory copy.
134
+ // / Enables different SDMA engines for the dispatch in a round-robin fashion.
135
+ Error asyncMemCopy (bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
136
+ const void *Src, hsa_agent_t SrcAgent, size_t Size ,
137
+ uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
138
+ hsa_signal_t CompletionSignal) {
139
+ if (!UseMultipleSdmaEngines) {
140
+ hsa_status_t S =
141
+ hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, Size ,
142
+ NumDepSignals, DepSignals, CompletionSignal);
143
+ return Plugin::check (S, " Error in hsa_amd_memory_async_copy: %s" );
144
+ }
145
+
146
+ // This solution is probably not the best
147
+ #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
148
+ HSA_AMD_INTERFACE_VERSION_MINOR >= 2 )
149
+ return Plugin::error (" Async copy on selected SDMA requires ROCm 5.7" );
150
+ #else
151
+ static std::atomic<int > SdmaEngine{1 };
152
+
153
+ // This atomics solution is probably not the best, but should be sufficient
154
+ // for now.
155
+ // In a worst case scenario, in which threads read the same value, they will
156
+ // dispatch to the same SDMA engine. This may result in sub-optimal
157
+ // performance. However, I think the possibility to be fairly low.
158
+ int LocalSdmaEngine = SdmaEngine.load (std::memory_order_acquire);
159
+ // This call is only avail in ROCm >= 5.7
160
+ hsa_status_t S = hsa_amd_memory_async_copy_on_engine (
161
+ Dst, DstAgent, Src, SrcAgent, Size , NumDepSignals, DepSignals,
162
+ CompletionSignal, (hsa_amd_sdma_engine_id_t )LocalSdmaEngine,
163
+ /* force_copy_on_sdma=*/ true );
164
+ // Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
165
+ LocalSdmaEngine = (LocalSdmaEngine << 1 ) % 7 ;
166
+ SdmaEngine.store (LocalSdmaEngine, std::memory_order_relaxed);
167
+
168
+ return Plugin::check (S, " Error in hsa_amd_memory_async_copy_on_engine: %s" );
169
+ #endif
170
+ }
171
+
133
172
} // namespace utils
134
173
135
174
// / Utility class representing generic resource references to AMDGPU resources.
@@ -945,6 +984,9 @@ struct AMDGPUStreamTy {
945
984
// / Timeout hint for HSA actively waiting for signal value to change
946
985
const uint64_t StreamBusyWaitMicroseconds;
947
986
987
+ // / Indicate to spread data transfers across all available SDMAs
988
+ bool UseMultipleSdmaEngines;
989
+
948
990
// / Return the current number of asychronous operations on the stream.
949
991
uint32_t size () const { return NextSlot; }
950
992
@@ -1170,15 +1212,15 @@ struct AMDGPUStreamTy {
1170
1212
InputSignal = nullptr ;
1171
1213
1172
1214
// Issue the async memory copy.
1173
- hsa_status_t Status;
1174
1215
if (InputSignal) {
1175
1216
hsa_signal_t InputSignalRaw = InputSignal->get ();
1176
- Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 1 ,
1177
- &InputSignalRaw, OutputSignal->get ());
1178
- } else
1179
- Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 0 ,
1180
- nullptr , OutputSignal->get ());
1181
- return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1217
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1218
+ CopySize, 1 , &InputSignalRaw,
1219
+ OutputSignal->get ());
1220
+ }
1221
+
1222
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1223
+ CopySize, 0 , nullptr , OutputSignal->get ());
1182
1224
}
1183
1225
1184
1226
// / Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1256,19 @@ struct AMDGPUStreamTy {
1214
1256
1215
1257
// Issue the first step: device to host transfer. Avoid defining the input
1216
1258
// dependency if already satisfied.
1217
- hsa_status_t Status;
1218
1259
if (InputSignal) {
1219
1260
hsa_signal_t InputSignalRaw = InputSignal->get ();
1220
- Status =
1221
- hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 1 ,
1222
- &InputSignalRaw, OutputSignals[0 ]->get ());
1261
+ if (auto Err = utils::asyncMemCopy (
1262
+ UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1 ,
1263
+ &InputSignalRaw, OutputSignals[0 ]->get ()))
1264
+ return Err;
1223
1265
} else {
1224
- Status = hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 0 ,
1225
- nullptr , OutputSignals[0 ]->get ());
1266
+ if (auto Err = utils::asyncMemCopy (UseMultipleSdmaEngines, Inter, Agent,
1267
+ Src, Agent, CopySize, 0 , nullptr ,
1268
+ OutputSignals[0 ]->get ()))
1269
+ return Err;
1226
1270
}
1227
1271
1228
- if (auto Err =
1229
- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
1230
- return Err;
1231
-
1232
1272
// Consume another stream slot and compute dependencies.
1233
1273
std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
1234
1274
assert (InputSignal && " Invalid input signal" );
@@ -1242,7 +1282,7 @@ struct AMDGPUStreamTy {
1242
1282
std::atomic_thread_fence (std::memory_order_release);
1243
1283
1244
1284
// Issue the second step: host to host transfer.
1245
- Status = hsa_amd_signal_async_handler (
1285
+ hsa_status_t Status = hsa_amd_signal_async_handler (
1246
1286
InputSignal->get (), HSA_SIGNAL_CONDITION_EQ, 0 , asyncActionCallback,
1247
1287
(void *)&Slots[Curr]);
1248
1288
@@ -1318,16 +1358,14 @@ struct AMDGPUStreamTy {
1318
1358
1319
1359
// Issue the second step: host to device transfer. Avoid defining the input
1320
1360
// dependency if already satisfied.
1321
- hsa_status_t Status;
1322
1361
if (InputSignal && InputSignal->load ()) {
1323
1362
hsa_signal_t InputSignalRaw = InputSignal->get ();
1324
- Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 1 ,
1325
- &InputSignalRaw, OutputSignal->get ());
1326
- } else
1327
- Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 0 ,
1328
- nullptr , OutputSignal->get ());
1329
-
1330
- return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1363
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter,
1364
+ Agent, CopySize, 1 , &InputSignalRaw,
1365
+ OutputSignal->get ());
1366
+ }
1367
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
1368
+ CopySize, 0 , nullptr , OutputSignal->get ());
1331
1369
}
1332
1370
1333
1371
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1391,15 @@ struct AMDGPUStreamTy {
1353
1391
// allocated by this runtime or the caller made the appropriate
1354
1392
// access calls.
1355
1393
1356
- hsa_status_t Status;
1357
1394
if (InputSignal && InputSignal->load ()) {
1358
1395
hsa_signal_t InputSignalRaw = InputSignal->get ();
1359
- Status =
1360
- hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize, 1 ,
1361
- &InputSignalRaw, OutputSignal->get ());
1362
- } else
1363
- Status = hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize,
1364
- 0 , nullptr , OutputSignal->get ());
1365
-
1366
- return Plugin::check (Status, " Error in D2D hsa_amd_memory_async_copy: %s" );
1396
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1397
+ SrcAgent, CopySize, 1 , &InputSignalRaw,
1398
+ OutputSignal->get ());
1399
+ }
1400
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1401
+ SrcAgent, CopySize, 0 , nullptr ,
1402
+ OutputSignal->get ());
1367
1403
}
1368
1404
1369
1405
// / Synchronize with the stream. The current thread waits until all operations
@@ -1788,6 +1824,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1788
1824
OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
1789
1825
64 ),
1790
1826
OMPX_StreamBusyWait (" LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000 ),
1827
+ OMPX_UseMultipleSdmaEngines (
1828
+ " LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false ),
1791
1829
AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
1792
1830
AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
1793
1831
@@ -2206,10 +2244,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2206
2244
if (auto Err = Signal.init ())
2207
2245
return Err;
2208
2246
2209
- Status = hsa_amd_memory_async_copy (TgtPtr, Agent, PinnedPtr, Agent, Size ,
2210
- 0 , nullptr , Signal.get ());
2211
- if (auto Err =
2212
- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2247
+ if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), TgtPtr,
2248
+ Agent, PinnedPtr, Agent, Size , 0 ,
2249
+ nullptr , Signal.get ()))
2213
2250
return Err;
2214
2251
2215
2252
if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2267,10 +2304,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2267
2304
if (auto Err = Signal.init ())
2268
2305
return Err;
2269
2306
2270
- Status = hsa_amd_memory_async_copy (PinnedPtr, Agent, TgtPtr, Agent, Size ,
2271
- 0 , nullptr , Signal.get ());
2272
- if (auto Err =
2273
- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2307
+ if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), PinnedPtr,
2308
+ Agent, TgtPtr, Agent, Size , 0 , nullptr ,
2309
+ Signal.get ()))
2274
2310
return Err;
2275
2311
2276
2312
if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2633,6 +2669,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2633
2669
});
2634
2670
}
2635
2671
2672
+ bool useMultipleSdmaEngines () const { return OMPX_UseMultipleSdmaEngines; }
2673
+
2636
2674
private:
2637
2675
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2638
2676
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2702,6 +2740,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2702
2740
// / are microseconds.
2703
2741
UInt32Envar OMPX_StreamBusyWait;
2704
2742
2743
+ // / Use ROCm 5.7 interface for multiple SDMA engines
2744
+ BoolEnvar OMPX_UseMultipleSdmaEngines;
2745
+
2705
2746
// / Stream manager for AMDGPU streams.
2706
2747
AMDGPUStreamManagerTy AMDGPUStreamManager;
2707
2748
@@ -2803,7 +2844,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
2803
2844
SignalManager (Device.getSignalManager()), Device(Device),
2804
2845
// Initialize the std::deque with some empty positions.
2805
2846
Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer(nullptr ),
2806
- StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
2847
+ StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2848
+ UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
2807
2849
2808
2850
// / Class implementing the AMDGPU-specific functionalities of the global
2809
2851
// / handler.
0 commit comments