-
Notifications
You must be signed in to change notification settings - Fork 165
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reorganize PTX tests to match generator (#2930)
- Loading branch information
1 parent
bc45573
commit cee542b
Showing
49 changed files
with
2,427 additions
and
2,407 deletions.
There are no files selected for viewing
40 changes: 40 additions & 0 deletions
40
libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
__global__ void test_barrier_cluster(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 780 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.arrive; | ||
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));)); | ||
#endif // __cccl_ptx_isa >= 780 | ||
|
||
#if __cccl_ptx_isa >= 780 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.wait; | ||
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));)); | ||
#endif // __cccl_ptx_isa >= 780 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.arrive.release; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.arrive.relaxed; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.wait.acquire; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
37 changes: 37 additions & 0 deletions
37
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
__global__ void test_cp_async_bulk(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // | ||
// 1a. unicast | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const uint32_t&, uint64_t*)>( | ||
cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, | ||
// [rdsmem_bar]; // 2. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&, uint64_t*)>( | ||
cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&)>( | ||
cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
10 changes: 10 additions & 0 deletions
10
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.commit_group; | ||
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
18 changes: 18 additions & 0 deletions
18
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
__global__ void test_cp_async_bulk_multicast(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], | ||
// size, [smem_bar], ctaMask; // 1. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const uint32_t&, | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
117 changes: 117 additions & 0 deletions
117
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
__global__ void test_cp_async_bulk_tensor(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1a. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[1], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[1], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1b. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[2], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[2], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1c. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[3], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[3], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1d. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[4], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[4], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1e. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[5], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
82 changes: 82 additions & 0 deletions
82
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[1], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[2], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[3], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[4], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[5], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
18 changes: 18 additions & 0 deletions
18
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.wait_group N; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.wait_group.read N; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group_read));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
Oops, something went wrong.