diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
new file mode 100644
index 00000000000..0a95f9dcece
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is that certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the `__device__`
+ * variable `fn_ptr`. Now, because weak stores from a single thread may be
+ * elided, we also wrap the store in an if branch that cannot be removed.
+ *
+ * To prevent dead-code-elimination of the if branch, we use
+ * `non_eliminated_false`, which uses inline assembly to hide the fact that it
+ * is always false from NVVM.
+ *
+ * So this is how we ensure that none of the function pointer stores are elided.
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__device__ void * fn_ptr = nullptr;
+
+__device__ bool non_eliminated_false(void){
+  int ret = 0;
+  asm ("": "=r"(ret)::);
+  return ret != 0;
+}
+
+__global__ void test_compilation() {
+#if __cccl_ptx_isa >= 810
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1.
+      auto overload = static_cast<void (*)(int32_t*, const int32_t&, uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1.
+      auto overload = static_cast<void (*)(int64_t*, const int64_t&, uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+  ));
+#endif // __cccl_ptx_isa >= 810
+
+#if __cccl_ptx_isa >= 810
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
+      auto overload = static_cast<void (*)(int32_t*, const int32_t (&)[2], uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
+      auto overload = static_cast<void (*)(int64_t*, const int64_t (&)[2], uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+  ));
+#endif // __cccl_ptx_isa >= 810
+
+#if __cccl_ptx_isa >= 810
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    if (non_eliminated_false()) {
+      // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3.
+      auto overload = static_cast<void (*)(int32_t*, const int32_t (&)[4], uint64_t*)>(cuda::ptx::st_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+  ));
+#endif // __cccl_ptx_isa >= 810
+}
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index e45eed54a42..b76f7ce4398 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -286,7 +286,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 [`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc
 [`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu
 [`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st
-[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async
+[`st.async`]: #stasync
 [`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red
 [`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu
 [`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority
@@ -299,6 +299,117 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 [`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa
 [`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
 
+#### `st.async`
+
+- PTX ISA: [`st.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async)
+
+**NOTE.** The alignment of `addr` must be a multiple of the vector size. For
+instance, the `addr` supplied to the `v2.b32` variant must be aligned to
+`2 x 4 = 8` bytes.
+
+**st_async**:
+```cuda
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type& value,
+  uint64_t* remote_bar);
+
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type (&value)[2],
+  uint64_t* remote_bar);
+
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
+template <typename B32>
+__device__ static inline void st_async(
+  B32* addr,
+  const B32 (&value)[4],
+  uint64_t* remote_bar);
+```
+
+**Usage**:
+```cuda
+#include <cuda/ptx>
+#include <cuda/barrier>
+#include <cooperative_groups.h>
+#include <cstdio>
+
+__global__ void __cluster_dims__(8, 1, 1) kernel()
+{
+  using cuda::ptx::sem_release;
+  using cuda::ptx::sem_acquire;
+  using cuda::ptx::space_cluster;
+  using cuda::ptx::space_shared;
+  using cuda::ptx::scope_cluster;
+
+  namespace cg = cooperative_groups;
+  cg::cluster_group cluster = cg::this_cluster();
+
+  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+
+  __shared__ int receive_buffer[4];
+  __shared__ barrier_t bar;
+  init(&bar, blockDim.x);
+
+  // Sync cluster to ensure remote barrier is initialized.
+  cluster.sync();
+
+  // Get address of remote cluster barrier:
+  unsigned int other_block_rank = cluster.block_rank() ^ 1;
+  uint64_t * remote_bar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank);
+  // int * remote_buffer = cluster.map_shared_rank(&receive_buffer, other_block_rank);
+  int * remote_buffer = cluster.map_shared_rank(&receive_buffer[0], other_block_rank);
+
+  // Arrive on local barrier:
+  uint64_t arrival_token;
+  if (threadIdx.x == 0) {
+    // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well
+    arrival_token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar), sizeof(receive_buffer));
+  } else {
+    arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar));
+  }
+
+  if (threadIdx.x == 0) {
+    printf("[block %d] arrived with expected tx count = %llu\n", cluster.block_rank(), sizeof(receive_buffer));
+  }
+
+  // Send bytes to remote buffer, arriving on remote barrier
+  if (threadIdx.x == 0) {
+    cuda::ptx::st_async(remote_buffer, {int(cluster.block_rank()), 2, 3, 4}, remote_bar);
+  }
+
+  if (threadIdx.x == 0) {
+    printf("[block %d] st_async to %p, %p\n",
+           cluster.block_rank(),
+           remote_buffer,
+           remote_bar
+    );
+  }
+
+  // Wait on local barrier:
+  while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {}
+
+  // Print received values:
+  if (threadIdx.x == 0) {
+    printf(
+      "[block %d] receive_buffer = {%d, %d, %d, %d}\n",
+      cluster.block_rank(),
+      receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]
+    );
+  }
+
+}
+
+int main() {
+  kernel<<<8, 128>>>();
+  cudaDeviceSynchronize();
+}
+```
 
 ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy)
 
 | Instruction | Available in libcu++ |
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
index 3414bea568d..48dd076986c 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -386,6 +386,141 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.8.12. Data Movement and Conversion Instructions: st.async
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async
 
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type& value,
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_LIBCUDACXX_DEVICE static inline void st_async(
+  _Type* __addr,
+  const _Type& __value,
+  _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "r"(__as_b32(__value)),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "l"(__as_b64(__value)),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    }
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type (&value)[2],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_LIBCUDACXX_DEVICE static inline void st_async(
+  _Type* __addr,
+  const _Type (&__value)[2],
+  _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "r"(__as_b32(__value[0])),
+          "r"(__as_b32(__value[1])),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) {
+      asm (
+        "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. "
+        :
+        : "r"(__as_ptr_remote_dsmem(__addr)),
+          "l"(__as_b64(__value[0])),
+          "l"(__as_b64(__value[1])),
+          "r"(__as_ptr_remote_dsmem(__remote_bar))
+        : "memory"
+      );
+    }
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
+template <typename B32>
+__device__ static inline void st_async(
+  B32* addr,
+  const B32 (&value)[4],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _B32>
+_LIBCUDACXX_DEVICE static inline void st_async(
+  _B32* __addr,
+  const _B32 (&__value)[4],
+  _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_B32) == 4, "");
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. "
+      :
+      : "r"(__as_ptr_remote_dsmem(__addr)),
+        "r"(__as_b32(__value[0])),
+        "r"(__as_b32(__value[1])),
+        "r"(__as_b32(__value[2])),
+        "r"(__as_b32(__value[3])),
+        "r"(__as_ptr_remote_dsmem(__remote_bar))
+      : "memory"
+    );
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 810
+
+
 // 9.7.8.13. Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red
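
Review note (not part of the patch): the three overloads added above can also be exercised in isolation with a tiny compile-only kernel. The sketch below is hypothetical — the file name `st_async_smoke.cu` and the pointer parameters are illustrative, and at run time the destination and barrier pointers would have to refer to remote (cluster) shared memory as in the documentation example. Compiling it with something like `nvcc -std=c++17 -arch=sm_90a -ptx` and grepping the output for `st.async.weak.shared::cluster` is a quick way to confirm the expected instructions are emitted.

```cuda
// st_async_smoke.cu -- illustrative compile-only check; not part of this patch.
// Assumes an SM_90 target (e.g. nvcc -std=c++17 -arch=sm_90a -ptx) so that
// __cccl_ptx_isa >= 810 holds and the NV_PROVIDES_SM_90 branch is taken.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void st_async_smoke(cuda::std::int32_t* remote_buf32,
                               cuda::std::int64_t* remote_buf64,
                               cuda::std::uint64_t* remote_bar)
{
  // Form 1: scalar .b32 and .b64 stores.
  cuda::ptx::st_async(remote_buf32, cuda::std::int32_t{42}, remote_bar);
  cuda::ptx::st_async(remote_buf64, cuda::std::int64_t{42}, remote_bar);

  // Form 2: .v2 vector store; the value is passed as a reference to an array of two.
  cuda::std::int32_t v2[2] = {1, 2};
  cuda::ptx::st_async(remote_buf32, v2, remote_bar);

  // Form 3: .v4.b32 vector store; a braced list binds to const int32_t (&)[4],
  // mirroring the documentation example above.
  cuda::ptx::st_async(remote_buf32, {1, 2, 3, 4}, remote_bar);
}
```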