From 6db5bb7eed2100642521071319078319394f7d6e Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 11:40:03 +0100 Subject: [PATCH 1/4] Add st.async (cherry picked from commit 30ce5abfbf4eb853602673480aca249c4007828d) --- .../cuda/ptx/ptx.st.async.compile.pass.cpp | 93 ++++++++++++ libcudacxx/docs/extended_api/ptx.md | 86 ++++++++++- .../std/detail/libcxx/include/__cuda/ptx.h | 135 ++++++++++++++++++ 3 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp new file mode 100644 index 00000000000..f9c9d0f57b9 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +/* + * We use a special strategy to force the generation of the PTX. This is mainly + * a fight against dead-code-elimination in the NVVM layer. + * + * The reason we need this strategy is because certain older versions of ptxas + * segfault when a non-sensical sequence of PTX is generated. So instead, we try + * to force the instantiation and compilation to PTX of all the overloads of the + * PTX wrapping functions. + * + * We do this by writing a function pointer of each overload to the `__device__` + * variable `fn_ptr`. Now, because weak stores from a single thread may be + * elided, we also wrap the store in an if branch that cannot be removed. + * + * To prevent dead-code-elimination of the if branch, we use + * `non_eliminated_false`, which uses inline assembly to hide the fact that is + * always false from NVVM. + * + * So this is how we ensure that none of the function pointer stores are elided. + * Because `fn_ptr` is possibly visible outside this translation unit, the + * compiler must compile all the functions which are stored. + * + */ + +__device__ void * fn_ptr = nullptr; + +__device__ bool non_eliminated_false(void){ + int ret = 0; + asm ("": "=r"(ret)::); + return ret != 0; +} + +__global__ void test_compilation() { +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. 
+ auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index e45eed54a42..c693b395bdb 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -286,7 +286,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release [`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc [`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu [`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st -[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +[`st.async`]: #stasync [`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red [`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu [`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority @@ -299,6 +299,90 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release [`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa [`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank +#### `st.async` + +- PTX ISA: [`st.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async) + +**st_async**: +```cuda +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type& value, + uint64_t* remote_bar); + +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type (&value)[2], + uint64_t* remote_bar); + +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. 
PTX ISA 81, SM_90 +template +__device__ static inline void st_async( + void* addr, + const B32 (&value)[4], + uint64_t* remote_bar); +``` + +**Usage**: +```cuda +#include +#include +#include +#include + +__global__ void __cluster_dims__(2, 1, 1) cluster_kernel() +{ + using cuda::ptx::sem_release; + using cuda::ptx::sem_acquire; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; + + using barrier_t = cuda::barrier; + + __shared__ int receive_buffer[4]; + __shared__ barrier_t bar; + init(&bar, blockDim.x); + // Sync cluster to ensure remote barrier is initialized. + cluster.sync(); + + // Get address of remote cluster barrier: + namespace cg = cooperative_groups; + cg::cluster_group cluster = cg::this_cluster(); + unsigned int other_block_rank = cluster.block_rank() ^ 1; + uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); + int * remote_buffer = cluster.map_shared_rank(&bar, other_block_rank); + + // Arrive on local barrier: + uint64_t arrival_token; + if (threadIdx.x == 0) { + // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well + arrival_token = cuda::ptx::mbarrier_arrive_expext_tx(sem_release, scope_cluster, space_shared, &bar, sizeof(receive_buffer)); + } else { + arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); + } + + // Send bytes to remote buffer, arriving on remote barrier + cuda::ptx::st_async(remote_buffer, {1, 2, 3, 4}, remote_bar); + + // Wait on local barrier: + while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {} + + // Print received values: + if (threadIdx.x == 0) { + printf("[@%d] receive_buffer = {%d, %d, %d, %d}\n", + cluster.block_rank(), + receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]); + } +} +``` ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 3414bea568d..9b22963cb92 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -386,6 +386,141 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void st_async( + void* __addr, + const _Type& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type (&value)[2], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void st_async( + void* __addr, + const _Type (&__value)[2], + _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value[0])), + "l"(__as_b64(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 +template +__device__ static inline void st_async( + void* addr, + const B32 (&value)[4], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void st_async( + void* __addr, + const _B32 (&__value)[4], + _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + + // 9.7.8.13. 
Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red From f66aec1443141944d70151c20b7b03c42ccb2e58 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 12:05:18 +0100 Subject: [PATCH 2/4] Fix usage example (cherry picked from commit dc1d934a84a3da44691895a6555634358906b77c) --- libcudacxx/docs/extended_api/ptx.md | 108 +++++++++++++++++----------- 1 file changed, 66 insertions(+), 42 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index c693b395bdb..3cb50f9f7ff 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -336,51 +336,75 @@ __device__ static inline void st_async( #include #include -__global__ void __cluster_dims__(2, 1, 1) cluster_kernel() +__global__ void __cluster_dims__(8, 1, 1) kernel() { - using cuda::ptx::sem_release; - using cuda::ptx::sem_acquire; - using cuda::ptx::space_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; + using cuda::ptx::sem_release; + using cuda::ptx::sem_acquire; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + + namespace cg = cooperative_groups; + cg::cluster_group cluster = cg::this_cluster(); + + using barrier_t = cuda::barrier; + + __shared__ int receive_buffer[4]; + __shared__ barrier_t bar; + init(&bar, blockDim.x); + + // Sync cluster to ensure remote barrier is initialized. + cluster.sync(); + + // Get address of remote cluster barrier: + unsigned int other_block_rank = cluster.block_rank() ^ 1; + uint64_t * remote_bar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank); + // int * remote_buffer = cluster.map_shared_rank(&receive_buffer, other_block_rank); + int * remote_buffer = cluster.map_shared_rank(&receive_buffer[0], other_block_rank); + + // Arrive on local barrier: + uint64_t arrival_token; + if (threadIdx.x == 0) { + // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well + arrival_token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar), sizeof(receive_buffer)); + } else { + arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar)); + } + + if (threadIdx.x == 0) { + printf("[block %d] arrived with expected tx count = %llu\n", cluster.block_rank(), sizeof(receive_buffer)); + } + + // Send bytes to remote buffer, arriving on remote barrier + if (threadIdx.x == 0) { + cuda::ptx::st_async(remote_buffer, {int(cluster.block_rank()), 2, 3, 4}, remote_bar); + } + + if (threadIdx.x == 0) { + printf("[block %d] st_async to %p, %p\n", + cluster.block_rank(), + remote_buffer, + remote_bar + ); + } + + // Wait on local barrier: + while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {} + + // Print received values: + if (threadIdx.x == 0) { + printf( + "[block %d] receive_buffer = {%d, %d, %d, %d}\n", + cluster.block_rank(), + receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3] + ); + } - using barrier_t = cuda::barrier; +} - __shared__ int receive_buffer[4]; - __shared__ barrier_t bar; - init(&bar, blockDim.x); - // Sync cluster to ensure remote 
barrier is initialized. - cluster.sync(); - - // Get address of remote cluster barrier: - namespace cg = cooperative_groups; - cg::cluster_group cluster = cg::this_cluster(); - unsigned int other_block_rank = cluster.block_rank() ^ 1; - uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); - int * remote_buffer = cluster.map_shared_rank(&bar, other_block_rank); - - // Arrive on local barrier: - uint64_t arrival_token; - if (threadIdx.x == 0) { - // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well - arrival_token = cuda::ptx::mbarrier_arrive_expext_tx(sem_release, scope_cluster, space_shared, &bar, sizeof(receive_buffer)); - } else { - arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); - } - - // Send bytes to remote buffer, arriving on remote barrier - cuda::ptx::st_async(remote_buffer, {1, 2, 3, 4}, remote_bar); - - // Wait on local barrier: - while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {} - - // Print received values: - if (threadIdx.x == 0) { - printf("[@%d] receive_buffer = {%d, %d, %d, %d}\n", - cluster.block_rank(), - receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]); - } +int main() { + kernel<<<8, 128>>>(); + cudaDeviceSynchronize(); } ``` ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) From 397cb0c8bb40463d251c24e65d977cf3cc895a36 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 13:41:08 +0100 Subject: [PATCH 3/4] Use typed instead of void pointers Because the size can be either 32 or 64 bit, this can catch a lot of errors. For instance: uint64_t * remote_buffer; uint64_t * remote_bar; cuda::ptx::st_async(remote_buffer, 1, remote_bar); would previously use the .b32 path because the `1` is an integer and determines the type resolution. Now, this will result in a compiler error. Resolution is to either (a) change the value type, or (b) change the buffer type. a) uint64_t * remote_buffer; cuda::ptx::st_async(remote_buffer, uint64_t(1), remote_bar); b) int32_t * remote_buffer; cuda::ptx::st_async(remote_buffer, 1, remote_bar); (cherry picked from commit 76044b749e7a017507f1c9eafd6b10f0ef1d23f9) --- .../test/cuda/ptx/ptx.st.async.compile.pass.cpp | 10 +++++----- libcudacxx/docs/extended_api/ptx.md | 6 +++--- .../cuda/std/detail/libcxx/include/__cuda/ptx.h | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp index f9c9d0f57b9..0a95f9dcece 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -50,12 +50,12 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. 
- auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); @@ -65,12 +65,12 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); @@ -80,7 +80,7 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 3cb50f9f7ff..5bfc2d7fa7c 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -309,7 +309,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type& value, uint64_t* remote_bar); @@ -317,14 +317,14 @@ __device__ static inline void st_async( // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type (&value)[2], uint64_t* remote_bar); // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. 
PTX ISA 81, SM_90 template __device__ static inline void st_async( - void* addr, + B32* addr, const B32 (&value)[4], uint64_t* remote_bar); ``` diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 9b22963cb92..48dd076986c 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -391,7 +391,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type& value, uint64_t* remote_bar); */ @@ -399,7 +399,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) { @@ -438,7 +438,7 @@ _LIBCUDACXX_DEVICE static inline void st_async( // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type (&value)[2], uint64_t* remote_bar); */ @@ -446,7 +446,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) { @@ -486,7 +486,7 @@ _LIBCUDACXX_DEVICE static inline void st_async( // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 template __device__ static inline void st_async( - void* addr, + B32* addr, const B32 (&value)[4], uint64_t* remote_bar); */ @@ -494,7 +494,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) { From 4a3f797f2d6a86c52d7933df56fa68d8b7070a54 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 13:56:33 +0100 Subject: [PATCH 4/4] Add note on alignment The type may be misleading on this one, so I added a note on alignment of the destination address. (cherry picked from commit 8230836aa80d67c131fdc776d840cd1e67d9d91f) --- libcudacxx/docs/extended_api/ptx.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 5bfc2d7fa7c..b76f7ce4398 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -303,6 +303,9 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release - PTX ISA: [`st.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async) +**NOTE.** Alignment of `addr` must be a multiple of vector size. For instance, +the `addr` supplied to the `v2.b32` variant must be aligned to `2 x 4 = 8` bytes. + **st_async**: ```cuda // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
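
---

A minimal sketch (not part of the patch series) of how the usage example from `ptx.md` looks with the alignment note from the last patch applied to the four-element overload. Because the patch text above has lost its `<...>` template arguments, the concrete types here — `cuda::barrier<cuda::thread_scope_block>`, the `alignas(16)` qualifier, the kernel name, and the launch configuration — are best-effort assumptions, not the literal content of the patch.

```cuda
// Hedged illustration: the .v4.b32 st_async overload stores 4 x 4 = 16 bytes,
// so the destination buffer is declared with an explicit 16-byte alignment.
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>
#include <cstdio>

__global__ void __cluster_dims__(2, 1, 1) alignment_sketch()
{
  using cuda::ptx::sem_release;
  using cuda::ptx::sem_acquire;
  using cuda::ptx::space_shared;
  using cuda::ptx::scope_cluster;

  namespace cg = cooperative_groups;
  cg::cluster_group cluster = cg::this_cluster();

  using barrier_t = cuda::barrier<cuda::thread_scope_block>;

  // alignas(16) makes the 16-byte alignment required by the four-element
  // overload explicit; __shared__ int arrays are not otherwise guaranteed it.
  __shared__ alignas(16) int receive_buffer[4];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  // Sync cluster to ensure the remote barrier is initialized (also orders the init above).
  cluster.sync();

  // Map the neighboring block's buffer and barrier into this block's address space.
  unsigned int other_block_rank = cluster.block_rank() ^ 1;
  auto* remote_bar    = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank);
  auto* remote_buffer = cluster.map_shared_rank(&receive_buffer[0], other_block_rank);

  // Arrive on the local barrier; thread 0 also announces the expected byte count.
  cuda::std::uint64_t arrival_token;
  if (threadIdx.x == 0) {
    arrival_token = cuda::ptx::mbarrier_arrive_expect_tx(
      sem_release, scope_cluster, space_shared,
      cuda::device::barrier_native_handle(bar), sizeof(receive_buffer));
  } else {
    arrival_token = cuda::ptx::mbarrier_arrive(
      sem_release, scope_cluster, space_shared,
      cuda::device::barrier_native_handle(bar));
  }

  // Store four 32-bit values to the 16-byte-aligned remote buffer,
  // completing the transaction count on the remote barrier.
  if (threadIdx.x == 0) {
    cuda::ptx::st_async(remote_buffer, {1, 2, 3, 4}, remote_bar);
  }

  // Wait until the neighboring block's store has arrived on the local barrier.
  while (!cuda::ptx::mbarrier_try_wait(
    sem_acquire, scope_cluster,
    cuda::device::barrier_native_handle(bar), arrival_token)) {}

  if (threadIdx.x == 0) {
    printf("[block %d] receive_buffer = {%d, %d, %d, %d}\n",
           cluster.block_rank(),
           receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]);
  }
}

int main()
{
  alignment_sketch<<<2, 128>>>();
  cudaDeviceSynchronize();
  return 0;
}
```

The mbarrier and st_async calls mirror the usage example added to `ptx.md` by the second patch; only the `alignas(16)` and the thread-0 guard around `init` are additions made for this sketch.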