Skip to content

Commit

Permalink
Add cuda::device::barrier_expect_tx (#498)
Browse files Browse the repository at this point in the history
  • Loading branch information
ahendriksen authored Oct 17, 2023
1 parent b269438 commit a3faeb1
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 16 deletions.
29 changes: 19 additions & 10 deletions libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,36 +46,45 @@ void mbarrier_complete_tx(
);
}

// Per-thread body of the arrive_tx test. Each thread arrives on the barrier
// (with `arrives_per_thread` arrival count) and adds a transaction count of 1,
// then completes that transaction manually and waits for the phase to finish.
//
// split_arrive_and_expect selects which API path is exercised:
//   true  -> barrier_expect_tx(...) followed by a plain b.arrive(...)
//   false -> the fused barrier_arrive_tx(...)
// Both paths must be observationally equivalent.
template<bool split_arrive_and_expect>
__device__
void thread(cuda::barrier<cuda::thread_scope_block>& b, int arrives_per_thread)
{
    constexpr int tx_count = 1;
    typename cuda::barrier<cuda::thread_scope_block>::arrival_token tok;

    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 (split_arrive_and_expect) {
        cuda::device::barrier_expect_tx(b, tx_count);
        tok = b.arrive(arrives_per_thread);
    } else {
        tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count);
    }

    // Manually increase the transaction count of the barrier, so that the
    // expected count added above is actually fulfilled and wait() can return.
    mbarrier_complete_tx(b, tx_count);

    b.wait(cuda::std::move(tok));
}

// Device-side driver for the arrive_tx tests: runs two rounds, first with
// arrival count 1 per thread, then with arrival count 2 per thread. Barriers
// are placed in shared memory via shared_memory_selector so that the
// tx-count operations (which require a shared-memory barrier) are valid.
template<bool split_arrive_and_expect>
__device__
void test()
{
    NV_DISPATCH_TARGET(
        NV_IS_DEVICE, (
            using barrier_t = cuda::barrier<cuda::thread_scope_block>;

            // Run all threads, each arriving with arrival count 1
            shared_memory_selector<barrier_t, constructor_initializer> sel_1;
            barrier_t* bar_1 = sel_1.construct(blockDim.x);
            __syncthreads();
            thread<split_arrive_and_expect>(*bar_1, 1);

            // Run all threads, each arriving with arrival count 2
            shared_memory_selector<barrier_t, constructor_initializer> sel_2;
            barrier_t* bar_2 = sel_2.construct(2 * blockDim.x);
            __syncthreads();
            thread<split_arrive_and_expect>(*bar_2, 2);
        )
    );
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ int main(int, char**)
cuda_thread_count = 256;
),
NV_IS_DEVICE, (
test();
constexpr bool split_arrive_and_expect = false;
test<split_arrive_and_expect>();
)
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ int main(int, char**)
cuda_thread_count = 2;
),
NV_IS_DEVICE, (
test();
constexpr bool split_arrive_and_expect = false;
test<split_arrive_and_expect>();
)
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ int main(int, char**)
cuda_thread_count = 32;
),
NV_IS_DEVICE, (
test();
constexpr bool split_arrive_and_expect = false;
test<split_arrive_and_expect>();
)
);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "arrive_tx.h"

int main(int, char**)
{
    NV_DISPATCH_TARGET(
        NV_IS_HOST, (
            // concurrent_agents_launch reads this to size the launch. It must
            // remain a plain int literal because the nvrtc tests grep the
            // source to discover how many threads to launch.
            cuda_thread_count = 256;
        ),
        NV_IS_DEVICE, (
            // Exercise the split barrier_expect_tx + arrive path.
            test<true>();
        )
    );

    return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include <cuda/barrier>
#include "test_macros.h"

// Suppress warning about barrier in shared memory
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

__device__ uint64_t bar_storage;

// Negative test: barrier_expect_tx has the precondition that the barrier is
// in shared memory. Here the barrier is backed by __device__ (global-memory)
// storage, so the call is expected to fail.
int main(int, char**){
    NV_IF_TARGET(
        NV_IS_DEVICE, (
            cuda::barrier<cuda::thread_scope_block> *bar_ptr;
            // Take the *address* of bar_storage. The original cast used the
            // (uninitialized) integer value of bar_storage itself, yielding a
            // garbage pointer instead of a pointer into device memory.
            bar_ptr = reinterpret_cast<cuda::barrier<cuda::thread_scope_block> *>(&bar_storage);

            if (threadIdx.x == 0) {
                init(bar_ptr, blockDim.x);
            }
            __syncthreads();

            // Should fail because the barrier is in device memory.
            cuda::device::barrier_expect_tx(*bar_ptr, 1);
        ));
    return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "arrive_tx.h"

int main(int, char**)
{
    NV_DISPATCH_TARGET(
        NV_IS_HOST, (
            // concurrent_agents_launch reads this to size the launch. It must
            // remain a plain int literal because the nvrtc tests grep the
            // source to discover how many threads to launch.
            cuda_thread_count = 2;
        ),
        NV_IS_DEVICE, (
            // Exercise the split barrier_expect_tx + arrive path.
            test<true>();
        )
    );

    return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "arrive_tx.h"

int main(int, char**)
{
    NV_DISPATCH_TARGET(
        NV_IS_HOST, (
            // concurrent_agents_launch reads this to size the launch. It must
            // remain a plain int literal because the nvrtc tests grep the
            // source to discover how many threads to launch.
            cuda_thread_count = 32;
        ),
        NV_IS_DEVICE, (
            // Exercise the split barrier_expect_tx + arrive path.
            test<true>();
        )
    );

    return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ If `!(scope == thread_block_scope && __isShared(this))`, then the semantics are
the same as [`cuda::std::barrier`]; otherwise, see below.
The `cuda::barrier` class templates extends `cuda::std::barrier` with the following additional operations:

| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` |
| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)` |
| [`cuda::device::barrier_arrive_tx`] | Arrive on a `cuda::barrier<cuda::thread_scope_block>` with transaction count update. `(function)` |
| [`cuda::device::barrier_expect_tx`] | Update transaction count of `cuda::barrier<cuda::thread_scope_block>`. `(function)` |

If `scope == thread_scope_block && __isShared(this)`, then the
semantics of [[thread.barrier.class]](http://eel.is/c++draft/thread.barrier.class) of ISO/IEC
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
---
grand_parent: Extended API
parent: Barriers
---

# `cuda::device::barrier_expect_tx`

Defined in header `<cuda/barrier>`:

```cuda
__device__
void cuda::device::barrier_expect_tx(
cuda::barrier<cuda::thread_scope_block>& bar,
ptrdiff_t transaction_count_update);
```

Updates the expected transaction count of a barrier in shared memory.

## Preconditions

* `__isShared(&bar) == true`
* `0 <= transaction_count_update && transaction_count_update <= (1 << 20) - 1`

## Effects

* This function increments the expected transaction count by `transaction_count_update`.
* This function executes atomically.

## Notes

This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
higher.

## Example

```cuda
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move
#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::barrier_expect_tx is not available.");
#endif // __CUDA_MINIMUM_ARCH__
__device__ alignas(16) int gmem_x[2048];
__global__ void example_kernel() {
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
__shared__ alignas(16) int smem_x[1024];
__shared__ barrier_t bar;
if (threadIdx.x == 0) {
init(&bar, blockDim.x);
}
__syncthreads();
if (threadIdx.x == 0) {
cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
cuda::device::barrier_expect_tx(bar, sizeof(smem_x));
}
auto token = bar.arrive(1);
bar.wait(cuda::std::move(token));
// smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
smem_x[threadIdx.x] += 1;
}
```

[See it on Godbolt](https://godbolt.org/z/9Yj89P76z){: .btn }


[`cuda::thread_scope`]: ./memory_model.md
[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object
[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12

27 changes: 27 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,33 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(

return async_contract_fulfillment::async;
}

// Atomically increase the expected transaction count of a shared-memory
// barrier by __transaction_count_update.
// Requires SM90+ (this definition sits inside the enclosing
// __CUDA_MINIMUM_ARCH__ guard, see #endif below).
// Preconditions (debug-checked below):
//   * __b resides in local shared memory
//   * 0 <= __transaction_count_update <= 2^20 - 1
_LIBCUDACXX_DEVICE inline
void barrier_expect_tx(
barrier<thread_scope_block> & __b,
_CUDA_VSTD::ptrdiff_t __transaction_count_update) {

_LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

// We do not check for the statespace of the barrier here. This is
// on purpose. This allows debugging tools like memcheck/racecheck
// to detect that we are passing a pointer with the wrong state
// space to mbarrier.arrive. If we checked for the state space here,
// and __trap() if wrong, then those tools would not be able to help
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.

// Convert the generic pointer to a 32-bit shared-memory address, as
// required by the shared::cta operand of the PTX instruction below.
auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
asm (
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
}
#endif // __CUDA_MINIMUM_ARCH__

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
Expand Down

0 comments on commit a3faeb1

Please sign in to comment.