diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
new file mode 100644
index 00000000000..0a95f9dcece
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is that certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the `__device__`
+ * variable `fn_ptr`. Now, because weak stores from a single thread may be
+ * elided, we also wrap the store in an if branch that cannot be removed.
+ *
+ * To prevent dead-code-elimination of the if branch, we use
+ * `non_eliminated_false`, which uses inline assembly to hide the fact that it
+ * is always false from NVVM.
+ *
+ * So this is how we ensure that none of the function pointer stores are elided.
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__device__ void * fn_ptr = nullptr;
+
+__device__ bool non_eliminated_false(void){
+  int ret = 0;
+  asm ("": "=r"(ret)::);
+  return ret != 0;
+}
+
+__global__ void test_compilation() {
+#if __cccl_ptx_isa >= 810
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1.
+      auto overload = static_cast<void (*)(int32_t*, const int32_t&, uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1.
+      auto overload = static_cast<void (*)(int64_t*, const int64_t&, uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+  ));
+#endif // __cccl_ptx_isa >= 810
+
+#if __cccl_ptx_isa >= 810
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
+      auto overload = static_cast<void (*)(int32_t*, const int32_t (&)[2], uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
+      auto overload = static_cast<void (*)(int64_t*, const int64_t (&)[2], uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+  ));
+#endif // __cccl_ptx_isa >= 810
+
+#if __cccl_ptx_isa >= 810
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3.
+      auto overload = static_cast<void (*)(int32_t*, const int32_t (&)[4], uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+  ));
+#endif // __cccl_ptx_isa >= 810
+}
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index e45eed54a42..b76f7ce4398 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -286,7 +286,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 [`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc
 [`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu
 [`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st
-[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async
+[`st.async`]: #stasync
 [`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red
 [`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu
 [`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority
@@ -299,6 +299,117 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 [`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa
 [`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
 
+#### `st.async`
+
+- PTX ISA: [`st.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async)
+
+**NOTE.** The alignment of `addr` must be a multiple of the vector size. For
+instance, the `addr` supplied to the `v2.b32` variant must be aligned to
+`2 x 4 = 8` bytes.
+
+**st_async**:
+```cuda
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type& value,
+  uint64_t* remote_bar);
+
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type (&value)[2],
+  uint64_t* remote_bar);
+
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
+template <typename B32>
+__device__ static inline void st_async(
+  B32* addr,
+  const B32 (&value)[4],
+  uint64_t* remote_bar);
+```
+
+**Usage**:
+```cuda
+#include <cuda/ptx>
+#include <cuda/barrier>
+#include <cooperative_groups.h>
+#include <cstdio>
+
+__global__ void __cluster_dims__(8, 1, 1) kernel()
+{
+  using cuda::ptx::sem_release;
+  using cuda::ptx::sem_acquire;
+  using cuda::ptx::space_cluster;
+  using cuda::ptx::space_shared;
+  using cuda::ptx::scope_cluster;
+
+  namespace cg = cooperative_groups;
+  cg::cluster_group cluster = cg::this_cluster();
+
+  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+
+  __shared__ int receive_buffer[4];
+  __shared__ barrier_t bar;
+  init(&bar, blockDim.x);
+
+  // Sync cluster to ensure remote barrier is initialized.
+  cluster.sync();
+
+  // Get address of remote cluster barrier:
+  unsigned int other_block_rank = cluster.block_rank() ^ 1;
+  uint64_t * remote_bar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank);
+  // int * remote_buffer = cluster.map_shared_rank(&receive_buffer, other_block_rank);
+  int * remote_buffer = cluster.map_shared_rank(&receive_buffer[0], other_block_rank);
+
+  // Arrive on local barrier:
+  uint64_t arrival_token;
+  if (threadIdx.x == 0) {
+    // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well
+    arrival_token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar), sizeof(receive_buffer));
+  } else {
+    arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar));
+  }
+
+  if (threadIdx.x == 0) {
+    printf("[block %d] arrived with expected tx count = %llu\n", cluster.block_rank(), sizeof(receive_buffer));
+  }
+
+  // Send bytes to remote buffer, arriving on remote barrier
+  if (threadIdx.x == 0) {
+    cuda::ptx::st_async(remote_buffer, {int(cluster.block_rank()), 2, 3, 4}, remote_bar);
+  }
+
+  if (threadIdx.x == 0) {
+    printf("[block %d] st_async to %p, %p\n",
+           cluster.block_rank(),
+           remote_buffer,
+           remote_bar
+    );
+  }
+
+  // Wait on local barrier:
+  while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {}
+
+  // Print received values:
+  if (threadIdx.x == 0) {
+    printf(
+      "[block %d] receive_buffer = {%d, %d, %d, %d}\n",
+      cluster.block_rank(),
+      receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]
+    );
+  }
+
+}
+
+int main() {
+  kernel<<<8, 128>>>();
+  cudaDeviceSynchronize();
+}
+```
 
 ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy)
 
 | Instruction | Available in libcu++ |
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
index 3414bea568d..48dd076986c 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -386,6 +386,141 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.8.12. Data Movement and Conversion Instructions: st.async
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async
 
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type& value,
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_LIBCUDACXX_DEVICE static inline void st_async(
+  _Type* __addr,
+  const _Type& __value,
+  _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "r"(__as_b32(__value)),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "l"(__as_b64(__value)),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    }
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type (&value)[2],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_LIBCUDACXX_DEVICE static inline void st_async(
+  _Type* __addr,
+  const _Type (&__value)[2],
+  _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "r"(__as_b32(__value[0])),
+          "r"(__as_b32(__value[1])),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "l"(__as_b64(__value[0])),
+          "l"(__as_b64(__value[1])),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    }
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
+template <typename B32>
+__device__ static inline void st_async(
+  B32* addr,
+  const B32 (&value)[4],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _B32>
+_LIBCUDACXX_DEVICE static inline void st_async(
+  _B32* __addr,
+  const _B32 (&__value)[4],
+  _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_B32) == 4, "");
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. "
+      :
+      : "r"(__as_ptr_remote_dsmem(__addr)),
+        "r"(__as_b32(__value[0])),
+        "r"(__as_b32(__value[1])),
+        "r"(__as_b32(__value[2])),
+        "r"(__as_b32(__value[3])),
+        "r"(__as_ptr_remote_dsmem(__remote_bar))
+      : "memory"
+    );
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 810
+
+
 // 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red
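
Review note (not part of the patch): the three overloads added above can also be exercised in isolation with a tiny compile-only kernel. The sketch below is hypothetical — the file name `st_async_smoke.cu` and the pointer parameters are illustrative, and at run time the destination and barrier pointers would have to refer to remote (cluster) shared memory as in the documentation example. Compiling it with something like `nvcc -std=c++17 -arch=sm_90a -ptx` and grepping the output for `st.async.weak.shared::cluster` is a quick way to confirm the expected instructions are emitted.

```cuda
// st_async_smoke.cu -- illustrative compile-only check; not part of this patch.
// Assumes an SM_90 target (e.g. nvcc -std=c++17 -arch=sm_90a -ptx) so that
// __cccl_ptx_isa >= 810 holds and the NV_PROVIDES_SM_90 branch is taken.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void st_async_smoke(cuda::std::int32_t* remote_buf32,
                               cuda::std::int64_t* remote_buf64,
                               cuda::std::uint64_t* remote_bar)
{
  // Form 1: scalar .b32 and .b64 stores.
  cuda::ptx::st_async(remote_buf32, cuda::std::int32_t{42}, remote_bar);
  cuda::ptx::st_async(remote_buf64, cuda::std::int64_t{42}, remote_bar);

  // Form 2: .v2 vector store; the value is passed as a reference to an array of two.
  cuda::std::int32_t v2[2] = {1, 2};
  cuda::ptx::st_async(remote_buf32, v2, remote_bar);

  // Form 3: .v4.b32 vector store; a braced list binds to const int32_t (&)[4],
  // mirroring the documentation example above.
  cuda::ptx::st_async(remote_buf32, {1, 2, 3, 4}, remote_bar);
}
```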