Reorganize PTX headers to match generator (#2925)
bernhardmgruber authored Nov 22, 2024
1 parent f6ec34b commit b27d512
Showing 47 changed files with 6,454 additions and 6,422 deletions.
124 changes: 1 addition & 123 deletions libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h
@@ -32,129 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
/*
// barrier.cluster.arrive; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive();
*/
#if __cccl_ptx_isa >= 780
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void barrier_cluster_arrive()
{
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm volatile("barrier.cluster.arrive;"
:
:
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 780

/*
// barrier.cluster.wait; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait();
*/
#if __cccl_ptx_isa >= 780
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void barrier_cluster_wait()
{
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm volatile("barrier.cluster.wait;"
:
:
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 780
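
/*
// Usage sketch for the two wrappers above. Assumes a thread-block cluster
// launch (e.g. cudaLaunchKernelEx with cudaLaunchAttributeClusterDimension)
// and compilation for sm_90 or newer; the kernel name is illustrative.
#include <cuda/ptx>

__global__ void cluster_kernel()
{
  // ... write data that other blocks in the cluster will read ...
  cuda::ptx::barrier_cluster_arrive(); // emits "barrier.cluster.arrive;"
  // ... independent work that does not depend on the other blocks ...
  cuda::ptx::barrier_cluster_wait(); // emits "barrier.cluster.wait;"
  // From here on, every block in the cluster has arrived.
}
*/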

/*
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .release }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_release_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t)
{
// __sem == sem_release (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm volatile("barrier.cluster.arrive.release;"
:
:
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 800

/*
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .relaxed }
// Marked volatile
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_relaxed_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t)
{
// __sem == sem_relaxed (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm volatile("barrier.cluster.arrive.relaxed;"
:
:
:);),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 800

/*
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
// .sem = { .acquire }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait(
cuda::ptx::sem_acquire_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t)
{
// __sem == sem_acquire (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm volatile("barrier.cluster.wait.acquire;"
:
:
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 800
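
/*
// Usage sketch for the PTX ISA 8.0 overloads above, which take explicit
// memory-ordering tags. The kernel is illustrative and assumes a cluster
// launch on sm_90 or newer.
#include <cuda/ptx>

__global__ void cluster_release_acquire()
{
  // Writes made before the release arrive are visible to blocks that
  // complete the matching acquire wait.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);

  // The relaxed arrive only counts the arrival and provides no ordering,
  // which is why its wrapper is volatile but does not clobber memory.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_relaxed);
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
}
*/
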
#include <cuda/__ptx/instructions/generated/barrier_cluster.inc>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

158 changes: 2 additions & 156 deletions libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h
@@ -32,162 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

// 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
/*
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80,
SM_90
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* smem_bar);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk(
space_cluster_t,
space_global_t,
void* __dstMem,
const void* __srcMem,
const _CUDA_VSTD::uint32_t& __size,
_CUDA_VSTD::uint64_t* __smem_bar)
{
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
:
: "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar))
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 800
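
/*
// Usage sketch for the unicast overload above, following the common pattern
// of a bulk global -> shared::cluster copy whose completion is signalled on
// an mbarrier. Kernel name and buffer size are assumptions; cuda::barrier,
// barrier_native_handle and barrier_arrive_tx come from <cuda/barrier>.
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/utility>

__global__ void bulk_load(const int* gmem_src)
{
  __shared__ alignas(128) int smem_dst[1024];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;

  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  cuda::barrier<cuda::thread_scope_block>::arrival_token token;
  if (threadIdx.x == 0)
  {
    // Issue the bulk copy; the hardware signals completion on the mbarrier.
    cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster,
                             cuda::ptx::space_global,
                             smem_dst,
                             gmem_src,
                             sizeof(smem_dst),
                             cuda::device::barrier_native_handle(bar));
    // Arrive and tell the barrier how many bytes to expect.
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_dst));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));
  // smem_dst is now valid for every thread of the block.
}
*/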

/*
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .shared::cta }
template <typename=void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_shared_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* rdsmem_bar);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk(
space_cluster_t,
space_shared_t,
void* __dstMem,
const void* __srcMem,
const _CUDA_VSTD::uint32_t& __size,
_CUDA_VSTD::uint64_t* __rdsmem_bar)
{
// __space == space_cluster (due to parameter type constraint)
// __space == space_shared (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
:
: "r"(__as_ptr_remote_dsmem(__dstMem)),
"r"(__as_ptr_smem(__srcMem)),
"r"(__size),
"r"(__as_ptr_remote_dsmem(__rdsmem_bar))
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 800
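
/*
// Usage sketch for the shared::cta -> shared::cluster overload above: the
// calling block pushes its shared-memory buffer into a peer block's shared
// memory, signalling a remote mbarrier. mbarrier initialization and cluster
// synchronization are elided; names and sizes are assumptions.
#include <cooperative_groups.h>

#include <cuda/ptx>
#include <cuda/std/cstdint>

namespace cg = cooperative_groups;

__global__ void push_to_peer()
{
  __shared__ alignas(128) int buf[256];
  __shared__ cuda::std::uint64_t mbar; // assumed initialized via cuda::ptx::mbarrier_init

  cg::cluster_group cluster = cg::this_cluster();
  const unsigned int peer   = (cluster.block_rank() + 1) % cluster.num_blocks();

  // Map our shared-memory symbols into the peer block's address space.
  int* dst                       = cluster.map_shared_rank(buf, peer);
  cuda::std::uint64_t* peer_mbar = cluster.map_shared_rank(&mbar, peer);

  if (threadIdx.x == 0)
  {
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, cuda::ptx::space_shared, dst, buf, sizeof(buf), peer_mbar);
  }
  // ... the peer block waits on its own mbarrier before reading the data ...
}
*/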

/*
// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
template <typename=void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
void* dstMem,
const void* srcMem,
const uint32_t& size);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void
cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size)
{
// __space == space_global (due to parameter type constraint)
// __space == space_shared (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
:
: "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 800
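
/*
// Usage sketch for the bulk_group overload above (shared::cta -> global).
// Completion is tracked through the bulk async-group rather than an
// mbarrier, so the copy is paired with commit_group/wait_group (wrappers
// from the sibling cp_async_bulk_commit_group / cp_async_bulk_wait_group
// headers). Names and sizes are assumptions.
#include <cuda/ptx>

__global__ void bulk_store(int* gmem_dst)
{
  __shared__ alignas(128) int smem_src[1024];

  // ... all threads of the block fill smem_src ...

  // Make the shared-memory writes visible to the async proxy and make sure
  // every thread has finished writing before the copy is issued.
  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);
  __syncthreads();

  if (threadIdx.x == 0)
  {
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, gmem_dst, smem_src, sizeof(smem_src));
    cuda::ptx::cp_async_bulk_commit_group();
    // Wait until the copy has finished reading smem_src so it can be reused.
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}
*/
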
/*
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar],
ctaMask; // 1. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename=void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* smem_bar,
const uint16_t& ctaMask);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk(
space_cluster_t,
space_global_t,
void* __dstMem,
const void* __srcMem,
const _CUDA_VSTD::uint32_t& __size,
_CUDA_VSTD::uint64_t* __smem_bar,
const _CUDA_VSTD::uint16_t& __ctaMask)
{
// __space == space_cluster (due to parameter type constraint)
// __space == space_global (due to parameter type constraint)
NV_IF_ELSE_TARGET(
NV_HAS_FEATURE_SM_90a,
(asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], "
"%4; // 1. "
:
: "r"(__as_ptr_smem(__dstMem)),
"l"(__as_ptr_gmem(__srcMem)),
"r"(__size),
"r"(__as_ptr_smem(__smem_bar)),
"h"(__ctaMask)
: "memory");),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();));
}
#endif // __cccl_ptx_isa >= 800
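
/*
// Usage sketch for the multicast overload above, which is only compiled for
// sm_90a (NV_HAS_FEATURE_SM_90a): one global-memory read is broadcast into
// the shared memory of the cluster CTAs selected by ctaMask. mbarrier setup
// and the per-block wait are elided; names, sizes and mask are assumptions.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void bulk_multicast_load(const int* gmem_src)
{
  __shared__ alignas(128) int smem_dst[1024];
  __shared__ cuda::std::uint64_t mbar; // assumed initialized via cuda::ptx::mbarrier_init

  if (threadIdx.x == 0)
  {
    // Bit i of the mask selects CTA i of the cluster as a copy destination.
    const cuda::std::uint16_t cta_mask = 0b1111; // CTAs 0..3, assuming a cluster of 4
    cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster,
                             cuda::ptx::space_global,
                             smem_dst,
                             gmem_src,
                             sizeof(smem_dst),
                             &mbar,
                             cta_mask);
  }
  // ... each selected block waits on its own mbarrier ...
}
*/
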
#include <cuda/__ptx/instructions/generated/cp_async_bulk.inc>
#include <cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h
@@ -32,27 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

// 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
/*
// cp.async.bulk.commit_group; // PTX ISA 80, SM_90
template <typename=void>
__device__ static inline void cp_async_bulk_commit_group();
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk_commit_group()
{
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(asm volatile("cp.async.bulk.commit_group;"
:
:
:);),
(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();));
}
#endif // __cccl_ptx_isa >= 800
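
/*
// Usage sketch for the wrapper above: several bulk copies issued back to
// back are folded into one bulk async-group by commit_group and then awaited
// together (cp_async_bulk and cp_async_bulk_wait_group come from the sibling
// headers; names and sizes are assumptions).
#include <cuda/ptx>

__global__ void flush_two_tiles(int* gmem_a, int* gmem_b)
{
  __shared__ alignas(128) int smem_a[256];
  __shared__ alignas(128) int smem_b[256];

  // ... fill smem_a / smem_b, fence the async proxy, __syncthreads() ...

  if (threadIdx.x == 0)
  {
    // Two back-to-back shared::cta -> global bulk copies ...
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, gmem_a, smem_a, sizeof(smem_a));
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, gmem_b, smem_b, sizeof(smem_b));
    // ... folded into a single bulk async-group ...
    cuda::ptx::cp_async_bulk_commit_group();
    // ... and awaited together.
    cuda::ptx::cp_async_bulk_wait_group(cuda::ptx::n32_t<0>{});
  }
}
*/
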
#include <cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

Expand Down