From 6db5bb7eed2100642521071319078319394f7d6e Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 11:40:03 +0100 Subject: [PATCH 1/4] Add st.async (cherry picked from commit 30ce5abfbf4eb853602673480aca249c4007828d) --- .../cuda/ptx/ptx.st.async.compile.pass.cpp | 93 ++++++++++++ libcudacxx/docs/extended_api/ptx.md | 86 ++++++++++- .../std/detail/libcxx/include/__cuda/ptx.h | 135 ++++++++++++++++++ 3 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp new file mode 100644 index 00000000000..f9c9d0f57b9 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +/* + * We use a special strategy to force the generation of the PTX. This is mainly + * a fight against dead-code-elimination in the NVVM layer. + * + * The reason we need this strategy is because certain older versions of ptxas + * segfault when a non-sensical sequence of PTX is generated. So instead, we try + * to force the instantiation and compilation to PTX of all the overloads of the + * PTX wrapping functions. + * + * We do this by writing a function pointer of each overload to the `__device__` + * variable `fn_ptr`. Now, because weak stores from a single thread may be + * elided, we also wrap the store in an if branch that cannot be removed. + * + * To prevent dead-code-elimination of the if branch, we use + * `non_eliminated_false`, which uses inline assembly to hide the fact that is + * always false from NVVM. + * + * So this is how we ensure that none of the function pointer stores are elided. + * Because `fn_ptr` is possibly visible outside this translation unit, the + * compiler must compile all the functions which are stored. + * + */ + +__device__ void * fn_ptr = nullptr; + +__device__ bool non_eliminated_false(void){ + int ret = 0; + asm ("": "=r"(ret)::); + return ret != 0; +} + +__global__ void test_compilation() { +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. 
+ auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. + auto overload = static_cast(cuda::ptx::st_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index e45eed54a42..c693b395bdb 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -286,7 +286,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release [`ld.global.nc`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc [`ldu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu [`st`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st -[`st.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +[`st.async`]: #stasync [`multimem.ld_reduce, multimem.st, multimem.red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red [`prefetch, prefetchu`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu [`applypriority`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority @@ -299,6 +299,90 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release [`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa [`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank +#### `st.async` + +- PTX ISA: [`st.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async) + +**st_async**: +```cuda +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type& value, + uint64_t* remote_bar); + +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type (&value)[2], + uint64_t* remote_bar); + +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. 
PTX ISA 81, SM_90 +template +__device__ static inline void st_async( + void* addr, + const B32 (&value)[4], + uint64_t* remote_bar); +``` + +**Usage**: +```cuda +#include +#include +#include +#include + +__global__ void __cluster_dims__(2, 1, 1) cluster_kernel() +{ + using cuda::ptx::sem_release; + using cuda::ptx::sem_acquire; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; + + using barrier_t = cuda::barrier; + + __shared__ int receive_buffer[4]; + __shared__ barrier_t bar; + init(&bar, blockDim.x); + // Sync cluster to ensure remote barrier is initialized. + cluster.sync(); + + // Get address of remote cluster barrier: + namespace cg = cooperative_groups; + cg::cluster_group cluster = cg::this_cluster(); + unsigned int other_block_rank = cluster.block_rank() ^ 1; + uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); + int * remote_buffer = cluster.map_shared_rank(&bar, other_block_rank); + + // Arrive on local barrier: + uint64_t arrival_token; + if (threadIdx.x == 0) { + // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well + arrival_token = cuda::ptx::mbarrier_arrive_expext_tx(sem_release, scope_cluster, space_shared, &bar, sizeof(receive_buffer)); + } else { + arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); + } + + // Send bytes to remote buffer, arriving on remote barrier + cuda::ptx::st_async(remote_buffer, {1, 2, 3, 4}, remote_bar); + + // Wait on local barrier: + while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {} + + // Print received values: + if (threadIdx.x == 0) { + printf("[@%d] receive_buffer = {%d, %d, %d, %d}\n", + cluster.block_rank(), + receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]); + } +} +``` ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 3414bea568d..9b22963cb92 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -386,6 +386,141 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void st_async( + void* __addr, + const _Type& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + void* addr, + const Type (&value)[2], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void st_async( + void* __addr, + const _Type (&__value)[2], + _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 4) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (sizeof(_Type) == 8) { + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value[0])), + "l"(__as_b64(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + } + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 +template +__device__ static inline void st_async( + void* addr, + const B32 (&value)[4], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void st_async( + void* __addr, + const _B32 (&__value)[4], + _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + + // 9.7.8.13. 
Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red From f66aec1443141944d70151c20b7b03c42ccb2e58 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 12:05:18 +0100 Subject: [PATCH 2/4] Fix usage example (cherry picked from commit dc1d934a84a3da44691895a6555634358906b77c) --- libcudacxx/docs/extended_api/ptx.md | 108 +++++++++++++++++----------- 1 file changed, 66 insertions(+), 42 deletions(-) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index c693b395bdb..3cb50f9f7ff 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -336,51 +336,75 @@ __device__ static inline void st_async( #include #include -__global__ void __cluster_dims__(2, 1, 1) cluster_kernel() +__global__ void __cluster_dims__(8, 1, 1) kernel() { - using cuda::ptx::sem_release; - using cuda::ptx::sem_acquire; - using cuda::ptx::space_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; + using cuda::ptx::sem_release; + using cuda::ptx::sem_acquire; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + + namespace cg = cooperative_groups; + cg::cluster_group cluster = cg::this_cluster(); + + using barrier_t = cuda::barrier; + + __shared__ int receive_buffer[4]; + __shared__ barrier_t bar; + init(&bar, blockDim.x); + + // Sync cluster to ensure remote barrier is initialized. + cluster.sync(); + + // Get address of remote cluster barrier: + unsigned int other_block_rank = cluster.block_rank() ^ 1; + uint64_t * remote_bar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank); + // int * remote_buffer = cluster.map_shared_rank(&receive_buffer, other_block_rank); + int * remote_buffer = cluster.map_shared_rank(&receive_buffer[0], other_block_rank); + + // Arrive on local barrier: + uint64_t arrival_token; + if (threadIdx.x == 0) { + // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well + arrival_token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar), sizeof(receive_buffer)); + } else { + arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, cuda::device::barrier_native_handle(bar)); + } + + if (threadIdx.x == 0) { + printf("[block %d] arrived with expected tx count = %llu\n", cluster.block_rank(), sizeof(receive_buffer)); + } + + // Send bytes to remote buffer, arriving on remote barrier + if (threadIdx.x == 0) { + cuda::ptx::st_async(remote_buffer, {int(cluster.block_rank()), 2, 3, 4}, remote_bar); + } + + if (threadIdx.x == 0) { + printf("[block %d] st_async to %p, %p\n", + cluster.block_rank(), + remote_buffer, + remote_bar + ); + } + + // Wait on local barrier: + while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {} + + // Print received values: + if (threadIdx.x == 0) { + printf( + "[block %d] receive_buffer = {%d, %d, %d, %d}\n", + cluster.block_rank(), + receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3] + ); + } - using barrier_t = cuda::barrier; +} - __shared__ int receive_buffer[4]; - __shared__ barrier_t bar; - init(&bar, blockDim.x); - // Sync cluster to ensure remote 
barrier is initialized. - cluster.sync(); - - // Get address of remote cluster barrier: - namespace cg = cooperative_groups; - cg::cluster_group cluster = cg::this_cluster(); - unsigned int other_block_rank = cluster.block_rank() ^ 1; - uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); - int * remote_buffer = cluster.map_shared_rank(&bar, other_block_rank); - - // Arrive on local barrier: - uint64_t arrival_token; - if (threadIdx.x == 0) { - // Thread 0 arrives and indicates it expects to receive a certain number of bytes as well - arrival_token = cuda::ptx::mbarrier_arrive_expext_tx(sem_release, scope_cluster, space_shared, &bar, sizeof(receive_buffer)); - } else { - arrival_token = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); - } - - // Send bytes to remote buffer, arriving on remote barrier - cuda::ptx::st_async(remote_buffer, {1, 2, 3, 4}, remote_bar); - - // Wait on local barrier: - while(!cuda::ptx::mbarrier_try_wait(sem_acquire, scope_cluster, cuda::device::barrier_native_handle(bar), arrival_token)) {} - - // Print received values: - if (threadIdx.x == 0) { - printf("[@%d] receive_buffer = {%d, %d, %d, %d}\n", - cluster.block_rank(), - receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]); - } +int main() { + kernel<<<8, 128>>>(); + cudaDeviceSynchronize(); } ``` ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) From 397cb0c8bb40463d251c24e65d977cf3cc895a36 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 13:41:08 +0100 Subject: [PATCH 3/4] Use typed instead of void pointers Because the size can be either 32 or 64 bit, this can catch a lot of errors. For instance: uint64_t * remote_buffer; uint64_t * remote_bar; cuda::ptx::st_async(remote_buffer, 1, remote_bar); would previously use the .b32 path because the `1` is an integer and determines the type resolution. Now, this will result in a compiler error. Resolution is to either (a) change the value type, or (b) change the buffer type. a) uint64_t * remote_buffer; cuda::ptx::st_async(remote_buffer, uint64_t(1), remote_bar); b) int32_t * remote_buffer; cuda::ptx::st_async(remote_buffer, 1, remote_bar); (cherry picked from commit 76044b749e7a017507f1c9eafd6b10f0ef1d23f9) --- .../test/cuda/ptx/ptx.st.async.compile.pass.cpp | 10 +++++----- libcudacxx/docs/extended_api/ptx.md | 6 +++--- .../cuda/std/detail/libcxx/include/__cuda/ptx.h | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp index f9c9d0f57b9..0a95f9dcece 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -50,12 +50,12 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. 
- auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); @@ -65,12 +65,12 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); @@ -80,7 +80,7 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 3cb50f9f7ff..5bfc2d7fa7c 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -309,7 +309,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type& value, uint64_t* remote_bar); @@ -317,14 +317,14 @@ __device__ static inline void st_async( // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type (&value)[2], uint64_t* remote_bar); // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. 
PTX ISA 81, SM_90 template __device__ static inline void st_async( - void* addr, + B32* addr, const B32 (&value)[4], uint64_t* remote_bar); ``` diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 9b22963cb92..48dd076986c 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -391,7 +391,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type& value, uint64_t* remote_bar); */ @@ -399,7 +399,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) { @@ -438,7 +438,7 @@ _LIBCUDACXX_DEVICE static inline void st_async( // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type (&value)[2], uint64_t* remote_bar); */ @@ -446,7 +446,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) { @@ -486,7 +486,7 @@ _LIBCUDACXX_DEVICE static inline void st_async( // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 template __device__ static inline void st_async( - void* addr, + B32* addr, const B32 (&value)[4], uint64_t* remote_bar); */ @@ -494,7 +494,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) { From 4a3f797f2d6a86c52d7933df56fa68d8b7070a54 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 13:56:33 +0100 Subject: [PATCH 4/4] Add note on alignment The type may be misleading on this one, so I added a note on alignment of the destination address. (cherry picked from commit 8230836aa80d67c131fdc776d840cd1e67d9d91f) --- libcudacxx/docs/extended_api/ptx.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 5bfc2d7fa7c..b76f7ce4398 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -303,6 +303,9 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release - PTX ISA: [`st.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async) +**NOTE.** Alignment of `addr` must be a multiple of vector size. For instance, +the `addr` supplied to the `v2.b32` variant must be aligned to `2 x 4 = 8` bytes. + **st_async**: ```cuda // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
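
---

A minimal sketch (not part of the patch series) of how the usage example from `ptx.md` looks with the alignment note from the last patch applied to the four-element overload. Because the patch text above has lost its `<...>` template arguments, the concrete types here — `cuda::barrier<cuda::thread_scope_block>`, the `alignas(16)` qualifier, the kernel name, and the launch configuration — are best-effort assumptions, not the literal content of the patch.

```cuda
// Hedged illustration: the .v4.b32 st_async overload stores 4 x 4 = 16 bytes,
// so the destination buffer is declared with an explicit 16-byte alignment.
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>
#include <cstdio>

__global__ void __cluster_dims__(2, 1, 1) alignment_sketch()
{
  using cuda::ptx::sem_release;
  using cuda::ptx::sem_acquire;
  using cuda::ptx::space_shared;
  using cuda::ptx::scope_cluster;

  namespace cg = cooperative_groups;
  cg::cluster_group cluster = cg::this_cluster();

  using barrier_t = cuda::barrier<cuda::thread_scope_block>;

  // alignas(16) makes the 16-byte alignment required by the four-element
  // overload explicit; __shared__ int arrays are not otherwise guaranteed it.
  __shared__ alignas(16) int receive_buffer[4];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  // Sync cluster to ensure the remote barrier is initialized (also orders the init above).
  cluster.sync();

  // Map the neighboring block's buffer and barrier into this block's address space.
  unsigned int other_block_rank = cluster.block_rank() ^ 1;
  auto* remote_bar    = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), other_block_rank);
  auto* remote_buffer = cluster.map_shared_rank(&receive_buffer[0], other_block_rank);

  // Arrive on the local barrier; thread 0 also announces the expected byte count.
  cuda::std::uint64_t arrival_token;
  if (threadIdx.x == 0) {
    arrival_token = cuda::ptx::mbarrier_arrive_expect_tx(
      sem_release, scope_cluster, space_shared,
      cuda::device::barrier_native_handle(bar), sizeof(receive_buffer));
  } else {
    arrival_token = cuda::ptx::mbarrier_arrive(
      sem_release, scope_cluster, space_shared,
      cuda::device::barrier_native_handle(bar));
  }

  // Store four 32-bit values to the 16-byte-aligned remote buffer,
  // completing the transaction count on the remote barrier.
  if (threadIdx.x == 0) {
    cuda::ptx::st_async(remote_buffer, {1, 2, 3, 4}, remote_bar);
  }

  // Wait until the neighboring block's store has arrived on the local barrier.
  while (!cuda::ptx::mbarrier_try_wait(
    sem_acquire, scope_cluster,
    cuda::device::barrier_native_handle(bar), arrival_token)) {}

  if (threadIdx.x == 0) {
    printf("[block %d] receive_buffer = {%d, %d, %d, %d}\n",
           cluster.block_rank(),
           receive_buffer[0], receive_buffer[1], receive_buffer[2], receive_buffer[3]);
  }
}

int main()
{
  alignment_sketch<<<2, 128>>>();
  cudaDeviceSynchronize();
  return 0;
}
```

The mbarrier and st_async calls mirror the usage example added to `ptx.md` by the second patch; only the `alignas(16)` and the thread-0 guard around `init` are additions made for this sketch.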