Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport branch/2.3.x] Add cuda::ptx::st_async #1093

Merged
merged 4 commits into from
Nov 14, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix usage example
(cherry picked from commit dc1d934)
ahendriksen authored and jrhemstad committed Nov 13, 2023
commit f66aec1443141944d70151c20b7b03c42ccb2e58
108 changes: 66 additions & 42 deletions libcudacxx/docs/extended_api/ptx.md
Original file line number Diff line number Diff line change
@@ -336,51 +336,75 @@ __device__ static inline void st_async(
#include <cuda/barrier>
#include <cooperative_groups.h>
__global__ void __cluster_dims__(2, 1, 1) cluster_kernel()
__global__ void __cluster_dims__(8, 1, 1) kernel()
{
using cuda::ptx::sem_release;
using cuda::ptx::sem_acquire;
using cuda::ptx::space_cluster;
using cuda::ptx::space_shared;
using cuda::ptx::scope_cluster;
using cuda::ptx::scope_cta;
using cuda::ptx::sem_release;
using cuda::ptx::sem_acquire;
using cuda::ptx::space_cluster;
using cuda::ptx::space_shared;
using cuda::ptx::scope_cluster;
namespace cg = cooperative_groups;
cg::cluster_group cluster = cg::this_cluster();
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
__shared__ int receive_buffer[4];
__shared__ barrier_t bar;
init(&bar, blockDim.x);
// Sync cluster to ensure remote barrier is initialized.
cluster.sync();
// Get address of remote cluster barrier:
unsigned int other_block_rank = cluster.block_rank() ^ 1;
uint64_t * remote_bar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank);
// int * remote_buffer = cluster.map_shared_rank(&receive_buffer, other_block_rank);
int * remote_buffer = cluster.map_shared_rank(&receive_buffer[0], other_block_rank);
// Arrive on local barrier:
uint64_t arrival_token;
if (threadIdx.x == 0) {
// Thread 0 arrives and indicates it expects to receive a certain number of bytes as well
arrival_token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar), sizeof(receive_buffer));
} else {
arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar));
}
if (threadIdx.x == 0) {
printf("[block %d] arrived with expected tx count = %llu\n", cluster.block_rank(), sizeof(receive_buffer));
}
// Send bytes to remote buffer, arriving on remote barrier
if (threadIdx.x == 0) {
cuda::ptx::st_async(remote_buffer, {int(cluster.block_rank()), 2, 3, 4}, remote_bar);
}
if (threadIdx.x == 0) {
printf("[block %d] st_async to %p, %p\n",
cluster.block_rank(),
remote_buffer,
remote_bar
);
}
// Wait on local barrier:
while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {}
// Print received values:
if (threadIdx.x == 0) {
printf(
"[block %d] receive_buffer = {%d, %d, %d, %d}\n",
cluster.block_rank(),
receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]
);
}
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
}
__shared__ int receive_buffer[4];
__shared__ barrier_t bar;
init(&bar, blockDim.x);
// Sync cluster to ensure remote barrier is initialized.
cluster.sync();
// Get address of remote cluster barrier:
namespace cg = cooperative_groups;
cg::cluster_group cluster = cg::this_cluster();
unsigned int other_block_rank = cluster.block_rank() ^ 1;
uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank);
int * remote_buffer = cluster.map_shared_rank(&bar, other_block_rank);
// Arrive on local barrier:
uint64_t arrival_token;
if (threadIdx.x == 0) {
// Thread 0 arrives and indicates it expects to receive a certain number of bytes as well
arrival_token = cuda::ptx::mbarrier_arrive_expext_tx(sem_release, scope_cluster, space_shared, &bar, sizeof(receive_buffer));
} else {
arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar);
}
// Send bytes to remote buffer, arriving on remote barrier
cuda::ptx::st_async(remote_buffer, {1, 2, 3, 4}, remote_bar);
// Wait on local barrier:
while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {}
// Print received values:
if (threadIdx.x == 0) {
printf("[@%d] receive_buffer = {%d, %d, %d, %d}\n",
cluster.block_rank(),
receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]);
}
int main() {
kernel<<<8, 128>>>();
cudaDeviceSynchronize();
}
```
### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy)