Reorganize PTX docs to match generator (#2929)
Co-authored-by: Allard Hendriksen <[email protected]>
bernhardmgruber and ahendriksen authored Nov 22, 2024
1 parent cee542b commit ee46f3e
Showing 54 changed files with 1,616 additions and 1,583 deletions.
32 changes: 16 additions & 16 deletions docs/libcudacxx/ptx/instructions.rst
@@ -6,25 +6,25 @@ PTX Instructions
 .. toctree::
    :maxdepth: 1
 
-   instructions/barrier.cluster
-   instructions/cp.async.bulk
-   instructions/cp.async.bulk.commit_group
-   instructions/cp.async.bulk.wait_group
-   instructions/cp.async.bulk.tensor
-   instructions/cp.reduce.async.bulk
-   instructions/cp.reduce.async.bulk.tensor
+   instructions/barrier_cluster
+   instructions/cp_async_bulk
+   instructions/cp_async_bulk_commit_group
+   instructions/cp_async_bulk_wait_group
+   instructions/cp_async_bulk_tensor
+   instructions/cp_reduce_async_bulk
+   instructions/cp_reduce_async_bulk_tensor
    instructions/fence
    instructions/getctarank
    instructions/mapa
-   instructions/mbarrier.init
-   instructions/mbarrier.arrive
-   instructions/mbarrier.expect_tx
-   instructions/mbarrier.test_wait
-   instructions/mbarrier.try_wait
-   instructions/red.async
-   instructions/st.async
-   instructions/tensormap.replace
-   instructions/tensormap.cp_fenceproxy
+   instructions/mbarrier_init
+   instructions/mbarrier_arrive
+   instructions/mbarrier_expect_tx
+   instructions/mbarrier_test_wait
+   instructions/mbarrier_try_wait
+   instructions/red_async
+   instructions/st_async
+   instructions/tensormap_replace
+   instructions/tensormap_cp_fenceproxy
    instructions/special_registers


16 changes: 16 additions & 0 deletions docs/libcudacxx/ptx/instructions/barrier_cluster.rst
@@ -0,0 +1,16 @@
.. _libcudacxx-ptx-instructions-barrier-cluster:

barrier.cluster
===============

- PTX ISA:
`barrier.cluster <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster>`__

Similar functionality is provided through the builtins
``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``,
as well as the ``cooperative_groups::cluster_group``
`API <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cluster-group>`__.

The ``.aligned`` variants of the instructions are not exposed.
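
For orientation, a minimal sketch of the cooperative groups route (assuming a
Hopper-class device launched with a cluster configuration; the kernel name is
hypothetical, not part of the generated reference):

.. code:: cuda

   #include <cooperative_groups.h>
   namespace cg = cooperative_groups;

   // Hypothetical kernel: a cluster-wide barrier, which maps onto a
   // barrier.cluster.arrive / barrier.cluster.wait instruction pair.
   __global__ void cluster_sync_example()
   {
       cg::cluster_group cluster = cg::this_cluster();
       cluster.sync();  // arrive at and wait on the cluster barrier
   }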

.. include:: generated/barrier_cluster.rst
30 changes: 30 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_async_bulk.rst
@@ -0,0 +1,30 @@
.. _libcudacxx-ptx-instructions-cp-async-bulk:

cp.async.bulk
=============

- PTX ISA:
`cp.async.bulk <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`__

Implementation notes
--------------------

**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and
``size`` must be a multiple of 16.
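
A minimal sketch of satisfying these requirements (assuming the
global-to-shared ``cp_async_bulk`` overload from the generated reference
below; the function and buffer names are hypothetical):

.. code:: cuda

   #include <cuda/ptx>

   // Hypothetical example: both pointers are 16-byte aligned and the copy
   // size is a multiple of 16, as required.
   __device__ void load_tile(const char* gmem_src,  // caller guarantees 16-byte alignment
                             uint64_t* smem_bar)
   {
       __shared__ alignas(16) char smem_dst[128];   // 16-byte aligned; 128 = 8 * 16
       cuda::ptx::cp_async_bulk(
           cuda::ptx::space_cluster, cuda::ptx::space_global,
           smem_dst, gmem_src, 128, smem_bar);
   }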

Changelog
---------

- In earlier versions, ``cp_async_bulk_multicast`` was enabled for
SM_90. This has been changed to SM_90a.


Unicast
-------

.. include:: generated/cp_async_bulk.rst

Multicast
---------

.. include:: generated/cp_async_bulk_multicast.rst
@@ -6,10 +6,4 @@ cp.async.bulk.commit_group
 - PTX ISA:
   `cp.async.bulk.commit_group <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group>`__
 
-cp.async.bulk.commit_group
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // cp.async.bulk.commit_group; // PTX ISA 80, SM_90
-   template <typename=void>
-   __device__ static inline void cp_async_bulk_commit_group();
+.. include:: generated/cp_async_bulk_commit_group.rst
23 changes: 23 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst
@@ -0,0 +1,23 @@
.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor:

cp.async.bulk.tensor
====================

- PTX ISA:
`cp.async.bulk.tensor <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__

Changelog
---------

- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled
for SM_90. This has been changed to SM_90a.

Unicast
-------

.. include:: generated/cp_async_bulk_tensor.rst

Multicast
---------

.. include:: generated/cp_async_bulk_tensor_multicast.rst
9 changes: 9 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst
@@ -0,0 +1,9 @@
.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group:

cp.async.bulk.wait_group
========================

- PTX ISA:
`cp.async.bulk.wait_group <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group>`__

.. include:: generated/cp_async_bulk_wait_group.rst
61 changes: 61 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst
@@ -0,0 +1,61 @@
.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk:

cp.reduce.async.bulk
====================

- PTX ISA:
`cp.reduce.async.bulk <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk>`__


Integer and floating point instructions
---------------------------------------

.. include:: generated/cp_reduce_async_bulk.rst

Emulation of ``.s64`` instruction
---------------------------------

PTX does not currently (as of CTK 12.3) expose
``cp.reduce.async.bulk.add.s64``, so ``cuda::ptx`` emulates it with the
following overloads:

.. code:: cuda

   // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
   // .dst = { .shared::cluster }
   // .src = { .shared::cta }
   // .type = { .s64 }
   // .op = { .add }
   template <typename=void>
   __device__ static inline void cp_reduce_async_bulk(
     cuda::ptx::space_cluster_t,
     cuda::ptx::space_shared_t,
     cuda::ptx::op_add_t,
     int64_t* dstMem,
     const int64_t* srcMem,
     uint32_t size,
     uint64_t* rdsmem_bar);

   // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90
   // .dst = { .global }
   // .src = { .shared::cta }
   // .type = { .s64 }
   // .op = { .add }
   template <typename=void>
   __device__ static inline void cp_reduce_async_bulk(
     cuda::ptx::space_global_t,
     cuda::ptx::space_shared_t,
     cuda::ptx::op_add_t,
     int64_t* dstMem,
     const int64_t* srcMem,
     uint32_t size);
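
As a usage illustration (a sketch only, not part of the generated reference;
the function name, buffer names, and element count are hypothetical), the
``bulk_group`` overload above could be driven like this:

.. code:: cuda

   #include <cuda/ptx>

   // Hypothetical example: asynchronously add 32 int64_t values from shared
   // memory into global memory using the emulated .s64 bulk reduction.
   __device__ void reduce_tile(int64_t* gmem_dst, const int64_t* smem_src)
   {
       cuda::ptx::cp_reduce_async_bulk(
           cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
           gmem_dst, smem_src, 32 * sizeof(int64_t));
       // The bulk_group variant completes through the commit/wait mechanism
       // (see cp.async.bulk.commit_group and cp.async.bulk.wait_group).
       cuda::ptx::cp_async_bulk_commit_group();
       cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
   }
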
FP16 instructions
-----------------

.. include:: generated/cp_reduce_async_bulk_f16.rst

BF16 instructions
-----------------

.. include:: generated/cp_reduce_async_bulk_bf16.rst
@@ -0,0 +1,9 @@
.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor:

cp.reduce.async.bulk.tensor
===========================

- PTX ISA:
`cp.reduce.async.bulk.tensor <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor>`__

.. include:: generated/cp_reduce_async_bulk_tensor.rst