@@ -19,6 +19,8 @@ namespace cuda {
 
 /*!
  * \brief CUDA kernel of segment reduce.
+ * \note each blockthread is responsible for aggregation on a row
+ *   in the result tensor.
  */
 template <typename IdType, typename DType,
           typename ReduceOp>
@@ -41,7 +43,9 @@ __global__ void SegmentReduceKernel(
 }
 
 /*!
- * \brief CUDA kernel of segment reduce.
+ * \brief CUDA kernel of backward phase in segment min/max.
+ * \note each blockthread is responsible for writing a row in the
+ *   result gradient tensor by looking up the ArgMin/Max for index information.
  */
 template <typename IdType, typename DType>
 __global__ void BackwardSegmentCmpKernel(
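Note on the \note added in the hunk above: the backward pass for a min/max reducer only routes the upstream gradient to the positions recorded by the forward ArgMin/Max. A minimal sketch of that access pattern follows (the real kernel body is elided from this hunk; the one-block-per-row mapping, the names grad_out, arg, grad_feat, and the negative-index sentinel are illustrative assumptions, not identifiers from this file):

#include <cstdint>

template <typename IdType, typename DType>
__global__ void BackwardSegmentCmpSketch(
    const DType* grad_out,   // gradient of the reduced output, shape (n, dim)
    const IdType* arg,       // ArgMin/Max indices recorded in the forward pass
    DType* grad_feat,        // gradient of the input features, assumed zero-initialized
    int64_t n, int64_t dim) {
  // Assumed mapping: one block per output row, threads sweep the columns.
  for (int row = blockIdx.x; row < n; row += gridDim.x) {
    for (int col = threadIdx.x; col < dim; col += blockDim.x) {
      const IdType idx = arg[row * dim + col];
      if (idx >= 0)  // assuming a negative index marks an empty segment
        grad_feat[idx * dim + col] = grad_out[row * dim + col];
    }
  }
}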
@@ -57,6 +61,13 @@ __global__ void BackwardSegmentCmpKernel(
   }
 }
 
+/*!
+ * \brief CUDA implementation of forward phase of Segment Reduce.
+ * \param feat The input tensor.
+ * \param offsets The offsets tensor.
+ * \param out The output tensor.
+ * \param arg An auxiliary tensor storing ArgMax/Min information.
+ */
 template <typename IdType, typename DType, typename ReduceOp>
 void SegmentReduce(
     NDArray feat,
@@ -80,12 +91,19 @@ void SegmentReduce(
   const int nty = 1;
   const dim3 nblks(nbx, nby);
   const dim3 nthrs(ntx, nty);
+  // TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance.
   CUDA_KERNEL_CALL((SegmentReduceKernel<IdType, DType, ReduceOp>),
       nblks, nthrs, 0, thr_entry->stream,
       feat_data, offsets_data, out_data, arg_data,
       n, dim);
 }
 
+/*!
+ * \brief CUDA implementation of backward phase of Segment Reduce with Min/Max reducer.
+ * \param feat The input tensor.
+ * \param arg The ArgMin/Max information, used for indexing.
+ * \param out The output tensor.
+ */
 template <typename IdType, typename DType>
 void BackwardSegmentCmp(
     NDArray feat,
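Regarding the TODO(zihao) line added above: cub's segmented reduction accepts begin/end offset iterators, which line up with the usual length-(num_segments + 1) offsets convention by passing d_offsets and d_offsets + 1. A hedged 1D sum illustration of that call pattern; SegmentedSumWithCub, d_in, d_out, d_offsets, and num_segments are placeholder names, not part of this file:

#include <cuda_runtime.h>
#include <cub/cub.cuh>

void SegmentedSumWithCub(const float* d_in, float* d_out,
                         const int* d_offsets, int num_segments,
                         cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call only queries the required temporary-storage size.
  cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
      d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call performs the segmented sum on the given stream.
  cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
      d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaFree(d_temp_storage);
}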