Reorganize PTX tests to match generator (#2930)

NVIDIA · Nov 22, 2024 · cee542b · cee542b
1 parent bc45573
commit cee542b
Show file tree

Hide file tree

Showing 49 changed files with 2,427 additions and 2,407 deletions.
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc
@@ -0,0 +1,40 @@
+__global__ void test_barrier_cluster(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.arrive;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));));
+#endif // __cccl_ptx_isa >= 780
+
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.wait;
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));));
+#endif // __cccl_ptx_isa >= 780
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.arrive.release;
+                   * fn_ptr++ = reinterpret_cast<void*>(
+                     static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.arrive.relaxed;
+                   * fn_ptr++ = reinterpret_cast<void*>(
+                     static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.wait.acquire;
+                   * fn_ptr++ = reinterpret_cast<void*>(
+                     static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc
@@ -0,0 +1,37 @@
+__global__ void test_cp_async_bulk(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; //
+        // 1a. unicast
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const uint32_t&, uint64_t*)>(
+            cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size,
+        // [rdsmem_bar]; // 2.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&, uint64_t*)>(
+            cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&)>(
+            cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
@@ -0,0 +1,10 @@
+__global__ void test_cp_async_bulk_commit_group(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.commit_group;
+        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc
@@ -0,0 +1,18 @@
+__global__ void test_cp_async_bulk_multicast(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_90a,
+    (
+        // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem],
+        // size, [smem_bar], ctaMask; // 1.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(cuda::ptx::space_cluster_t,
+                               cuda::ptx::space_global_t,
+                               void*,
+                               const void*,
+                               const uint32_t&,
+                               uint64_t*,
+                               const uint16_t&)>(cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc
@@ -0,0 +1,117 @@
+__global__ void test_cp_async_bulk_tensor(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1a.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[1], uint64_t*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[1], const void*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1b.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[2], uint64_t*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[2], const void*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1c.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[3], uint64_t*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[3], const void*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1d.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[4], uint64_t*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[4], const void*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1e.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[5], uint64_t*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(
+            cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc
@@ -0,0 +1,82 @@
+__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_90a,
+    (
+        // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
+        // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(cuda::ptx::space_cluster_t,
+                               cuda::ptx::space_global_t,
+                               void*,
+                               const void*,
+                               const int32_t(&)[1],
+                               uint64_t*,
+                               const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_90a,
+    (
+        // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
+        // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(cuda::ptx::space_cluster_t,
+                               cuda::ptx::space_global_t,
+                               void*,
+                               const void*,
+                               const int32_t(&)[2],
+                               uint64_t*,
+                               const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_90a,
+    (
+        // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
+        // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(cuda::ptx::space_cluster_t,
+                               cuda::ptx::space_global_t,
+                               void*,
+                               const void*,
+                               const int32_t(&)[3],
+                               uint64_t*,
+                               const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_90a,
+    (
+        // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
+        // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(cuda::ptx::space_cluster_t,
+                               cuda::ptx::space_global_t,
+                               void*,
+                               const void*,
+                               const int32_t(&)[4],
+                               uint64_t*,
+                               const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_90a,
+    (
+        // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem],
+        // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
+        * fn_ptr++ = reinterpret_cast<void*>(
+          static_cast<void (*)(cuda::ptx::space_cluster_t,
+                               cuda::ptx::space_global_t,
+                               void*,
+                               const void*,
+                               const int32_t(&)[5],
+                               uint64_t*,
+                               const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc
@@ -0,0 +1,18 @@
+__global__ void test_cp_async_bulk_wait_group(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // cp.async.bulk.wait_group N;
+                   * fn_ptr++ = reinterpret_cast<void*>(
+                     static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // cp.async.bulk.wait_group.read N;
+                   * fn_ptr++ = reinterpret_cast<void*>(
+                     static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group_read));));
+#endif // __cccl_ptx_isa >= 800
+}