Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport 574 ptx #663

Merged
merged 2 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

#include "concurrent_agents.h"
#include "cuda_space_selector.h"
#include "test_macros.h"

// No-op argument sink: accepts and discards any number of values so that
// variables kept alive only for compilation purposes do not trigger
// "set but never used" warnings in the test kernel.
template <typename... _Ty>
__device__ inline bool __unused(_Ty...)
{
  return true;
}

// Compile-only smoke test: instantiates the cuda::ptx::mbarrier_arrive* overloads
// that are valid for each PTX ISA version / SM architecture combination. The
// kernel is never launched, so only successful compilation is being checked.
// NOTE(review): the trailing "// 1.", "// 3a." etc. comments presumably index the
// numbered mbarrier.arrive forms in the PTX ISA specification — confirm against
// the cuda::ptx documentation.
__global__ void test_compilation() {
using cuda::ptx::sem_release;
using cuda::ptx::space_cluster;
using cuda::ptx::space_shared;
using cuda::ptx::scope_cluster;
using cuda::ptx::scope_cta;

// mbarrier objects must live in shared memory; the values written here are
// never observed since the kernel is never run.
__shared__ uint64_t bar;
bar = 1;
uint64_t state = 1;

// mbarrier.arrive exists from PTX ISA 7.0 on SM_80+.
#if __cccl_ptx_isa >= 700
NV_IF_TARGET(NV_PROVIDES_SM_80, (
state = cuda::ptx::mbarrier_arrive(&bar); // 1.
state = cuda::ptx::mbarrier_arrive_no_complete(&bar, 1); // 5.
));
#endif // __cccl_ptx_isa >= 700

// The arrive-with-count overload requires SM_90 (PTX ISA 7.8+).
// This guard is redundant: before PTX ISA 7.8, there was no support for SM_90
#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2.
));
#endif // __cccl_ptx_isa >= 780

// PTX ISA 8.0 adds the fully-parameterized forms taking explicit semantics,
// scope, and state space. The space_cluster variants return no token, hence
// no assignment to `state` for those calls.
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 3a.
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); // 3a.

state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b.
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b.

cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a.

cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b.

state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8.
state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8.

cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9.
));
#endif // __cccl_ptx_isa >= 800
// Keep `bar` and `state` formally "used" to silence compiler warnings.
__unused(bar, state);
}

// Host entry point. This is a compile-only test — the kernel above is never
// launched — so reaching main and returning success is all that is required.
int main(int, char**) { return 0; }
2 changes: 2 additions & 0 deletions libcudacxx/.upstream-tests/test/support/concurrent_agents.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#endif
#endif

#include <cuda/std/cassert>

#include "test_macros.h"

TEST_EXEC_CHECK_DISABLE
Expand Down
2 changes: 2 additions & 0 deletions libcudacxx/docs/extended_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ nav_order: 3

{% include_relative extended_api/functional.md %}

{% include_relative extended_api/ptx.md %}

[Thread Scopes]: ./extended_api/memory_model.md#thread-scopes
[Thread Groups]: ./extended_api/thread_groups.md

669 changes: 669 additions & 0 deletions libcudacxx/docs/extended_api/ptx.md

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// <cuda/ptx> umbrella header: forwards to the internal __cuda/ptx.h
// implementation, which defines the cuda::ptx::* wrappers exercised elsewhere
// in this change. The include order below (config, pragma push, payload,
// pragma pop) mirrors the other <cuda/...> umbrella headers — do not reorder.

#ifndef _CUDA_PTX
#define _CUDA_PTX

// Library configuration (compiler/dialect detection, ABI namespace macros).
#include "std/detail/__config"

// Suppress/restore compiler diagnostics around the implementation header.
#include "std/detail/__pragma_push"

// The actual cuda::ptx definitions.
#include "std/detail/libcxx/include/__cuda/ptx.h"

#include "std/detail/__pragma_pop"

#endif // _CUDA_PTX
3 changes: 3 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__config
Original file line number Diff line number Diff line change
Expand Up @@ -1505,6 +1505,9 @@ typedef __char32_t char32_t;
#define _LIBCUDACXX_END_NAMESPACE_CUDA } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } }
#define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } }
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER

#include "../cstdlib" // _LIBCUDACXX_UNREACHABLE
#include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t
#include "../__cuda/ptx.h" // cuda::ptx::*

#if defined(_LIBCUDACXX_COMPILER_NVRTC)
#define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
Expand Down Expand Up @@ -206,29 +207,21 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
else if (!__isShared(&__barrier)) {
__trap();
}

asm volatile ("mbarrier.arrive.shared.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__update))
: "memory");
// Cannot use cuda::device::barrier_native_handle here, as it is
// only defined for block-scope barriers. This barrier may be a
// non-block scoped barrier.
auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier);
__token = _CUDA_VPTX::mbarrier_arrive(__bh, __update);
), NV_PROVIDES_SM_80, (
if (!__isShared(&__barrier)) {
return __barrier.arrive(__update);
}

auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier);
// Need 2 instructions, can't finish barrier with arrive > 1
if (__update > 1) {
asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__update - 1))
: "memory");
_CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1);
}
asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier)))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive( __bh);
), NV_IS_DEVICE, (
if (!__isShared(&__barrier)) {
return __barrier.arrive(__update);
Expand Down Expand Up @@ -603,27 +596,22 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.

auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
auto __native_handle = barrier_native_handle(__b);
auto __bh = __cvta_generic_to_shared(__native_handle);
if (__arrive_count_update == 1) {
asm (
"mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive_expect_tx(
_CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __transaction_count_update
);
} else {
asm (
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
asm (
"mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive(
_CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update
);
}
)
);
Expand Down Expand Up @@ -1089,8 +1077,8 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _
));

NV_IF_TARGET(NV_PROVIDES_SM_80, (
const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group);
if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) {
const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group);
if (__can_use_async_group) {
__cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size);
return __completion_mechanism::__async_group;
Expand Down
Loading