Reorganize PTX tests to match generator (#2930)
bernhardmgruber authored Nov 22, 2024
1 parent bc45573 commit cee542b
Showing 49 changed files with 2,427 additions and 2,407 deletions.
40 changes: 40 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc
@@ -0,0 +1,40 @@
__global__ void test_barrier_cluster(void** fn_ptr)
{
#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90,
(
// barrier.cluster.arrive;
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));));
#endif // __cccl_ptx_isa >= 780

#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90,
(
// barrier.cluster.wait;
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));));
#endif // __cccl_ptx_isa >= 780

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90,
(
// barrier.cluster.arrive.release;
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90,
(
// barrier.cluster.arrive.relaxed;
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90,
(
// barrier.cluster.wait.acquire;
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));));
#endif // __cccl_ptx_isa >= 800
}
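
For context: every entry in the file above is gated twice — an `#if __cccl_ptx_isa >= N` preprocessor check for the PTX ISA version the toolkit can emit, and an NV_IF_TARGET check for the architecture being compiled. A minimal sketch of that guard pattern, assuming <cuda/ptx> (which defines __cccl_ptx_isa) and <nv/target> (which defines NV_IF_TARGET and NV_PROVIDES_SM_90) are available; it is illustrative only and not part of the commit:

// Minimal sketch of the guard pattern used throughout these generated tests.
#include <cuda/ptx>
#include <nv/target>

__global__ void guard_sketch(void** fn_ptr)
{
#if __cccl_ptx_isa >= 780 // compiled only if the toolkit supports PTX ISA 7.8+
  NV_IF_TARGET(NV_PROVIDES_SM_90, // emitted only when targeting sm_90 or newer
               (*fn_ptr++ = reinterpret_cast<void*>(
                  static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));));
#endif
}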
37 changes: 37 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc
@@ -0,0 +1,37 @@
__global__ void test_cp_async_bulk(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; //
// 1a. unicast
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const uint32_t&, uint64_t*)>(
cuda::ptx::cp_async_bulk));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size,
// [rdsmem_bar]; // 2.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&, uint64_t*)>(
cuda::ptx::cp_async_bulk));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&)>(
cuda::ptx::cp_async_bulk));));
#endif // __cccl_ptx_isa >= 800
}
10 changes: 10 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
@@ -0,0 +1,10 @@
__global__ void test_cp_async_bulk_commit_group(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.commit_group;
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));));
#endif // __cccl_ptx_isa >= 800
}
18 changes: 18 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc
@@ -0,0 +1,18 @@
__global__ void test_cp_async_bulk_multicast(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem],
// size, [smem_bar], ctaMask; // 1.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void*,
const void*,
const uint32_t&,
uint64_t*,
const uint16_t&)>(cuda::ptx::cp_async_bulk));));
#endif // __cccl_ptx_isa >= 800
}
117 changes: 117 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc
@@ -0,0 +1,117 @@
__global__ void test_cp_async_bulk_tensor(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
// tensorCoords], [smem_bar];// 1a.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[1], uint64_t*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[1], const void*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
// tensorCoords], [smem_bar];// 1b.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[2], uint64_t*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[2], const void*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
// tensorCoords], [smem_bar];// 1c.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[3], uint64_t*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[3], const void*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
// tensorCoords], [smem_bar];// 1d.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[4], uint64_t*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[4], const void*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
// tensorCoords], [smem_bar];// 1e.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[5], uint64_t*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>(
cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800
}
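
For context: each entry above stores the address of one specific cuda::ptx::cp_async_bulk_tensor overload; the static_cast to an explicit function-pointer type is what selects that overload from the set and forces it to be instantiated. A standalone sketch of the same technique follows, using a made-up demo_copy overload set as a stand-in for the real cuda::ptx API:

// Sketch of the overload-selection technique (illustrative only).
#include <cstdint>

__device__ inline void demo_copy(const std::int32_t (&)[1]) {} // 1-D variant
__device__ inline void demo_copy(const std::int32_t (&)[2]) {} // 2-D variant

__global__ void select_overloads(void** fn_ptr)
{
  // The explicit signature in the static_cast picks exactly one overload;
  // storing the resulting pointer keeps the instantiation from being dropped.
  *fn_ptr++ = reinterpret_cast<void*>(
    static_cast<void (*)(const std::int32_t(&)[1])>(demo_copy));
  *fn_ptr++ = reinterpret_cast<void*>(
    static_cast<void (*)(const std::int32_t(&)[2])>(demo_copy));
}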
82 changes: 82 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc
@@ -0,0 +1,82 @@
__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void*,
const void*,
const int32_t(&)[1],
uint64_t*,
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void*,
const void*,
const int32_t(&)[2],
uint64_t*,
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void*,
const void*,
const int32_t(&)[3],
uint64_t*,
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void*,
const void*,
const int32_t(&)[4],
uint64_t*,
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_HAS_FEATURE_SM_90a,
(
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void*,
const void*,
const int32_t(&)[5],
uint64_t*,
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
#endif // __cccl_ptx_isa >= 800
}
18 changes: 18 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc
@@ -0,0 +1,18 @@
__global__ void test_cp_async_bulk_wait_group(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90,
(
// cp.async.bulk.wait_group N;
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group));));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90,
(
// cp.async.bulk.wait_group.read N;
* fn_ptr++ = reinterpret_cast<void*>(
static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group_read));));
#endif // __cccl_ptx_isa >= 800
}
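
For context: the commit splits the PTX tests into per-instruction .inc files under generated/, mirroring the layout produced by the test generator. A hypothetical driver translation unit that consumes one of them might look like the sketch below; the actual wiring in the repository is not shown in this diff and may differ:

// Hypothetical driver (assumption, not part of this commit): including the
// generated file and launching its kernel forces every cuda::ptx overload it
// references to be compiled and linked.
#include <cuda/ptx>
#include <cuda_runtime.h>
#include <nv/target>

#include "generated/barrier_cluster.inc" // defines test_barrier_cluster

int main()
{
  void** fn_ptrs = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&fn_ptrs), 64 * sizeof(void*));
  test_barrier_cluster<<<1, 1>>>(fn_ptrs); // one stored pointer per overload
  cudaDeviceSynchronize();
  cudaFree(fn_ptrs);
  return 0;
}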