diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h index 9fcc2130dda..6d16bfd6fcc 100644 --- a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h @@ -46,36 +46,45 @@ void mbarrier_complete_tx( ); } -template +template __device__ -void thread(Barrier& b, int arrives_per_thread) +void thread(cuda::barrier& b, int arrives_per_thread) { constexpr int tx_count = 1; - auto tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count); + typename cuda::barrier::arrival_token tok; + + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 (split_arrive_and_expect) { + cuda::device::barrier_expect_tx(b, tx_count); + tok = b.arrive(arrives_per_thread); + } else{ + tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count); + } + // Manually increase the transaction count of the barrier. mbarrier_complete_tx(b, tx_count); b.wait(cuda::std::move(tok)); } +template __device__ void test() { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( // Run all threads, each arriving with arrival count 1 - constexpr auto block = cuda::thread_scope_block; + using barrier_t = cuda::barrier; - __shared__ cuda::barrier bar_1; - init(&bar_1, (int) blockDim.x); + shared_memory_selector sel_1; + barrier_t* bar_1 = sel_1.construct(blockDim.x); __syncthreads(); - thread(bar_1, 1); + thread(*bar_1, 1); // Run all threads, each arriving with arrival count 2 - __shared__ cuda::barrier bar_2; - init(&bar_2, (int) 2 * blockDim.x); + shared_memory_selector sel_2; + barrier_t* bar_2 = sel_2.construct(2 * blockDim.x); __syncthreads(); - thread(bar_2, 2); + thread(*bar_2, 2); ) ); } diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp index 9c67e23c6f9..0d2f5ca841b 100644 --- a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_cta.pass.cpp @@ -25,7 +25,8 @@ int main(int, char**) cuda_thread_count = 256; ), NV_IS_DEVICE, ( - test(); + constexpr bool split_arrive_and_expect = false; + test(); ) ); diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp index d1ac1d66de8..a6ac3803bd7 100644 --- a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_thread.pass.cpp @@ -25,7 +25,8 @@ int main(int, char**) cuda_thread_count = 2; ), NV_IS_DEVICE, ( - test(); + constexpr bool split_arrive_and_expect = false; + test(); ) ); diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp index 0c75474e179..4cc310d9580 100644 --- a/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx_warp.pass.cpp @@ -25,7 +25,8 @@ int main(int, char**) cuda_thread_count = 32; ), NV_IS_DEVICE, ( - test(); + constexpr bool split_arrive_and_expect = false; + test(); ) ); diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_cta.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_cta.pass.cpp new file mode 100644 index 00000000000..771eb6fde70 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_cta.pass.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. + cuda_thread_count = 256; + ), + NV_IS_DEVICE, ( + constexpr bool split_arrive_and_expect = true; + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_device.runfail.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_device.runfail.cpp new file mode 100644 index 00000000000..6da34c14afa --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_device.runfail.cpp @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include +#include "test_macros.h" + +// Suppress warning about barrier in shared memory +TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init) + +__device__ uint64_t bar_storage; + +int main(int, char**){ + NV_IF_TARGET( + NV_IS_DEVICE, ( + cuda::barrier *bar_ptr; + bar_ptr = reinterpret_cast *>(bar_storage); + + if (threadIdx.x == 0) { + init(bar_ptr, blockDim.x); + } + __syncthreads(); + + // Should fail because the barrier is in device memory. + cuda::device::barrier_expect_tx(*bar_ptr, 1); + )); + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_thread.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_thread.pass.cpp new file mode 100644 index 00000000000..5ec98c26ee2 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_thread.pass.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. + cuda_thread_count = 2; + ), + NV_IS_DEVICE, ( + constexpr bool split_arrive_and_expect = true; + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_warp.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_warp.pass.cpp new file mode 100644 index 00000000000..bf626a902b8 --- /dev/null +++ b/libcudacxx/.upstream-tests/test/cuda/barrier/expect_tx_warp.pass.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads +// UNSUPPORTED: pre-sm-90 + +// + +#include "arrive_tx.h" + +int main(int, char**) +{ + NV_DISPATCH_TARGET( + NV_IS_HOST, ( + // Required by concurrent_agents_launch to know how many we're + // launching. This can only be an int, because the nvrtc tests use grep + // to figure out how many threads to launch. + cuda_thread_count = 32; + ), + NV_IS_DEVICE, ( + constexpr bool split_arrive_and_expect = true; + test(); + ) + ); + + return 0; +} diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md index 7acfb95d918..c4bb54ee259 100644 --- a/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md +++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier.md @@ -20,10 +20,10 @@ If `!(scope == thread_block_scope && __isShared(this))`, then the semantics are the same as [`cuda::std::barrier`]; otherwise, see below. The `cuda::barrier` class templates extends `cuda::std::barrier` with the following additional operations: -| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` | -| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)` | +| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` | +| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)` | | [`cuda::device::barrier_arrive_tx`] | Arrive on a `cuda::barrier` with transaction count update. `(function)` | - +| [`cuda::device::barrier_expect_tx`] | Update transaction count of `cuda::barrier`. `(function)` | If `scope == thread_scope_block && __isShared(this)`, then the semantics of [[thread.barrier.class]](http://eel.is/c++draft/thread.barrier.class) of ISO/IEC diff --git a/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_expect_tx.md b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_expect_tx.md new file mode 100644 index 00000000000..f029b99e922 --- /dev/null +++ b/libcudacxx/docs/extended_api/synchronization_primitives/barrier/barrier_expect_tx.md @@ -0,0 +1,75 @@ +--- +grand_parent: Extended API +parent: Barriers +--- + +# `cuda::device::barrier_expect_tx` + +Defined in header ``: + +```cuda +__device__ +void cuda::device::barrier_expect_tx( + cuda::barrier& bar, + ptrdiff_t transaction_count_update); +``` + +Updates the expected transaction count of a barrier in shared memory. + +## Preconditions + +* `__isShared(&bar) == true` +* `0 <= transaction_count_update && transaction_count_update <= (1 << 20) - 1` + +## Effects + +* This function increments the expected transaction count by `transaction_count_update`. +* This function executes atomically. + +## Notes + +This function can only be used under CUDA Compute Capability 9.0 (Hopper) or +higher. + +## Example + +```cuda +#include +#include // cuda::std::move + +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900 +static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_expect_tx is not available."); +#endif // __CUDA_MINIMUM_ARCH__ + +__device__ alignas(16) int gmem_x[2048]; + +__global__ void example_kernel() { + using barrier_t = cuda::barrier; + __shared__ alignas(16) int smem_x[1024]; + __shared__ barrier_t bar; + + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + } + __syncthreads(); + + if (threadIdx.x == 0) { + cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar); + cuda::device::barrier_expect_tx(bar, sizeof(smem_x)); + } + auto token = bar.arrive(1); + + bar.wait(cuda::std::move(token)); + + // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023] + smem_x[threadIdx.x] += 1; +} +``` + +[See it on Godbolt](https://godbolt.org/z/9Yj89P76z){: .btn } + + +[`cuda::thread_scope`]: ./memory_model.md +[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object +[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12 + diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h index da6b09b3e3d..7ffaa1734e5 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h @@ -652,6 +652,33 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx( return async_contract_fulfillment::async; } + +_LIBCUDACXX_DEVICE inline +void barrier_expect_tx( + barrier & __b, + _CUDA_VSTD::ptrdiff_t __transaction_count_update) { + + _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object + _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); + + // We do not check for the statespace of the barrier here. This is + // on purpose. This allows debugging tools like memcheck/racecheck + // to detect that we are passing a pointer with the wrong state + // space to mbarrier.arrive. If we checked for the state space here, + // and __trap() if wrong, then those tools would not be able to help + // us in release builds. In debug builds, the error would be caught + // by the asserts at the top of this function. + + auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b)); + asm ( + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) + : "memory"); +} #endif // __CUDA_MINIMUM_ARCH__ _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE