doudoubobo
diff --git a/‎python/dgl/backend/backend.py
+42 b/‎python/dgl/backend/backend.py
+42
diff --git a/‎python/dgl/backend/mxnet/sparse.py
+35-3 b/‎python/dgl/backend/mxnet/sparse.py
+35-3
diff --git a/‎python/dgl/backend/mxnet/tensor.py
+3 b/‎python/dgl/backend/mxnet/tensor.py
+3
diff --git a/‎python/dgl/backend/pytorch/sparse.py
+32-2 b/‎python/dgl/backend/pytorch/sparse.py
+32-2
diff --git a/‎python/dgl/backend/pytorch/tensor.py
+3 b/‎python/dgl/backend/pytorch/tensor.py
+3
diff --git a/‎python/dgl/backend/tensorflow/sparse.py
+28-3 b/‎python/dgl/backend/tensorflow/sparse.py
+28-3
diff --git a/‎python/dgl/backend/tensorflow/tensor.py
+6 b/‎python/dgl/backend/tensorflow/tensor.py
+6
diff --git a/‎python/dgl/ops/__init__.py
+1 b/‎python/dgl/ops/__init__.py
+1
diff --git a/‎python/dgl/ops/segment.py
+15-16 b/‎python/dgl/ops/segment.py
+15-16
diff --git a/‎python/dgl/sparse.py
+77 b/‎python/dgl/sparse.py
+77
@@ -370,6 +370,23 @@ def reduce_sum(input):
     """
     pass
 
+def cumsum(input, dim):
+    """Return the cumulative sum of the elements along a given axis.
+
+    Parameters
+    ----------
+    input : Tensor
+        The input tensor.
+    dim : int
+        The dimension to accumulate along.
+
+    Returns
+    -------
+    Tensor
+        A framework-specific tensor holding the cumulative sums.
+    """
+    pass
+
 def mean(input, dim):
     """Reduce average the input tensor along the given dim.
 
@@ -1489,6 +1506,31 @@ def edge_softmax(gidx, logits, eids, norm_by):
     Tensor
         Softmax value
     """
+    pass
+
+def segment_reduce(op, x, offsets):
+    """Segment reduction operator.
+
+    It aggregates the tensor ``x`` along the first dimension by segments.
+    Segment ``i`` covers rows ``offsets[i]`` to ``offsets[i + 1]`` of ``x``,
+    so ``offsets`` has one more element than there are segments.
+    Zero-length segments are allowed.
+
+    Parameters
+    ----------
+    op : str
+        Aggregation method. Can be 'sum', 'max', 'min'.
+    x : Tensor
+        Value to aggregate.
+    offsets : Tensor
+        Segment boundaries (cumulative sum of segment lengths, starting at 0).
+
+    Returns
+    -------
+    Tensor
+        Aggregated tensor of shape ``(len(offsets) - 1, x.shape[1:])``.
+    """
+    pass
 
 
 ###############################################################################
 
@@ -1,11 +1,11 @@
 import mxnet as mx
 import numpy as np
 from mxnet import nd
-from ...sparse import _gspmm, _gsddmm
+from ...sparse import _gspmm, _gsddmm, _segment_reduce, _bwd_segment_cmp
 from ...base import dgl_warning, is_all, ALL
 from .tensor import asnumpy, copy_to, zerocopy_from_numpy, context, to_backend_ctx
 
-__all__ = ['gspmm', 'gsddmm', 'edge_softmax']
+__all__ = ['gspmm', 'gsddmm', 'edge_softmax', 'segment_reduce']
 
 
 def _scatter_nd(index, src, n_rows):
@@ -28,7 +28,7 @@ def _scatter_nd(index, src, n_rows):
     if ndim > 1:
         new_idx = index * stride + sum(offsets)
     else:
-        new_idx = index 
+        new_idx = index
     src = src.reshape(-1)
     new_idx = new_idx.reshape(-1)
     rst = np.zeros((stride * n_rows,), dtype=src.dtype)
@@ -328,3 +328,35 @@ def backward(self, grad_out):
 def edge_softmax(gidx, logits, eids=ALL, norm_by='dst'):
     softmax_op = EdgeSoftmax(gidx, eids, norm_by)
     return softmax_op(logits)
+
+
+class SegmentReduce(mx.autograd.Function):
+    """Autograd function wrapping the segment reduction kernel.
+
+    Parameters
+    ----------
+    op : str
+        Aggregation method forwarded to ``_segment_reduce``.
+    offsets : Tensor
+        Segment boundaries; its last element is used as the number of
+        input rows when computing gradients.
+    """
+    def __init__(self, op, offsets):
+        super(SegmentReduce, self).__init__()
+        self.op = op
+        self.offsets = offsets
+
+    def forward(self, x):
+        """Reduce ``x`` by segments and keep the arg tensor for backward."""
+        y, arg = _segment_reduce(self.op, x, self.offsets)
+        self.save_for_backward(arg)
+        return y
+
+    def backward(self, dy):
+        """Compute the gradient w.r.t. the input from the output gradient."""
+        arg, = self.saved_tensors
+        offsets = self.offsets
+        m = offsets[-1].asscalar()
+        if self.op == 'sum':
+            # Map every input row to the id of the segment it belongs to.
+            # An offset equal to ``m`` (trailing empty segment) would index
+            # past a length-``m`` buffer, so count into ``m + 1`` buckets
+            # and drop the extra one after the cumulative sum.
+            offsets_np = asnumpy(offsets[1:])
+            indices_np = np.zeros((m + 1,), dtype=offsets_np.dtype)
+            np.add.at(indices_np, offsets_np, np.ones_like(offsets_np))
+            indices_np = np.cumsum(indices_np, -1)[:-1]
+            indices = zerocopy_from_numpy(indices_np)
+            # Each input row receives the gradient of its segment's output.
+            dx = dy[indices]
+        else:
+            dx = _bwd_segment_cmp(dy, arg, m)
+        return dx
+
+
+def segment_reduce(op, x, offsets):
+    """Apply the segment reduction ``op`` to ``x`` via ``SegmentReduce``."""
+    return SegmentReduce(op, offsets)(x)
@@ -152,6 +152,9 @@ def sum(input, dim, keepdims=False):
 def reduce_sum(input):
     return input.sum()
 
+def cumsum(input, dim):
+    """Return the cumulative sum of ``input`` along axis ``dim``."""
+    accumulated = nd.cumsum(input, axis=dim)
+    return accumulated
+
 def mean(input, dim):
     return nd.mean(input, axis=dim)
 
 
@@ -1,8 +1,8 @@
 import torch as th
 from ...base import is_all, ALL
-from ...sparse import _gspmm, _gsddmm
+from ...sparse import _gspmm, _gsddmm, _segment_reduce, _bwd_segment_cmp
 
-__all__ = ['gspmm', 'gsddmm', 'edge_softmax']
+__all__ = ['gspmm', 'gsddmm', 'edge_softmax', 'segment_reduce']
 
 
 def _reduce_grad(grad, shape):
@@ -231,6 +231,32 @@ def backward(ctx, grad_out):
         return None, grad_score, None, None
 
 
+class SegmentReduce(th.autograd.Function):
+    """Autograd function wrapping the segment reduction kernel."""
+    @staticmethod
+    def forward(ctx, op, x, offsets):
+        """Reduce ``x`` by segments and stash what backward needs on ``ctx``."""
+        y, arg = _segment_reduce(op, x, offsets)
+        ctx.save_for_backward(arg, offsets)
+        ctx.backward_cache = op
+        return y
+
+    @staticmethod
+    def backward(ctx, dy):
+        """Compute the gradient w.r.t. ``x`` from the output gradient ``dy``."""
+        op = ctx.backward_cache
+        arg, offsets = ctx.saved_tensors
+        m = offsets[-1].item()
+        if op == 'sum':
+            # Map every input row to the id of the segment it belongs to.
+            # An offset equal to ``m`` (trailing empty segment) would index
+            # past a length-``m`` buffer, so count into ``m + 1`` buckets
+            # and drop the extra one after the cumulative sum.
+            offsets = offsets[1:]
+            indices = th.zeros(
+                (m + 1,), device=offsets.device, dtype=offsets.dtype)
+            indices.scatter_add_(0, offsets, th.ones_like(offsets))
+            indices = th.cumsum(indices, -1)[:-1]
+            # Each input row receives the gradient of its segment's output.
+            dx = dy[indices]
+        else:
+            dx = _bwd_segment_cmp(dy, arg, m)
+        # One entry per forward input (op, x, offsets); only x gets a gradient.
+        return None, dx, None
+
+
 def gspmm(gidx, op, reduce_op, lhs_data, rhs_data):
     return GSpMM.apply(gidx, op, reduce_op, lhs_data, rhs_data)
 
@@ -241,3 +267,7 @@ def gsddmm(gidx, op, lhs_data, rhs_data, lhs_target='u', rhs_target='v'):
 
 def edge_softmax(gidx, logits, eids=ALL, norm_by='dst'):
     return EdgeSoftmax.apply(gidx, logits, eids, norm_by)
+
+
+def segment_reduce(op, x, offsets):
+    """Apply the segment reduction ``op`` to ``x`` via ``SegmentReduce``."""
+    reduce_fn = SegmentReduce.apply
+    return reduce_fn(op, x, offsets)
@@ -120,6 +120,9 @@ def sum(input, dim, keepdims=False):
 def reduce_sum(input):
     return input.sum()
 
+def cumsum(input, dim):
+    """Return the cumulative sum of ``input`` along dimension ``dim``."""
+    accumulated = th.cumsum(input, dim=dim)
+    return accumulated
+
 def mean(input, dim):
     return th.mean(input, dim=dim)
 
 
@@ -1,10 +1,10 @@
 import tensorflow as tf
 import numpy as np
-from .tensor import tensor, copy_to, context
+from .tensor import tensor, copy_to, context, asnumpy, zerocopy_from_numpy
 from ...base import is_all, ALL
-from ...sparse import _gspmm, _gsddmm
+from ...sparse import _gspmm, _gsddmm, _segment_reduce, _bwd_segment_cmp
 
-__all__ = ['gspmm', 'gsddmm', 'edge_softmax']
+__all__ = ['gspmm', 'gsddmm', 'edge_softmax', 'segment_reduce']
 
 
 def _scatter_nd(index, src, n_rows):
@@ -254,3 +254,28 @@ def _lambda(logits):
         return edge_softmax_real(gidx, logits, eids, norm_by)
     return _lambda(logits)
 
+
+def segment_reduce_real(op, x, offsets):
+    """Run the segment reduction and build its gradient function.
+
+    Returns the reduced tensor together with a callable that maps the
+    gradient of the output to the gradient with respect to ``x``.
+    """
+    y, arg = _segment_reduce(op, x, offsets)
+
+    def segment_reduce_backward(dy):
+        m = x.shape[0]
+        if op == 'sum':
+            # Map every input row to the id of the segment it belongs to.
+            # An offset equal to ``m`` (trailing empty segment) would index
+            # past a length-``m`` buffer, so count into ``m + 1`` buckets
+            # and drop the extra one after the cumulative sum.
+            offsets_np = asnumpy(offsets[1:])
+            indices_np = np.zeros((m + 1,), dtype=offsets_np.dtype)
+            np.add.at(indices_np, offsets_np, np.ones_like(offsets_np))
+            indices_np = np.cumsum(indices_np, -1)[:-1]
+            indices = zerocopy_from_numpy(indices_np)
+            # Each input row receives the gradient of its segment's output.
+            dx = tf.gather(dy, indices)
+        else:
+            dx = _bwd_segment_cmp(dy, arg, m)
+        return dx
+
+    return y, segment_reduce_backward
+
+
+def segment_reduce(op, x, offsets):
+    """Apply the segment reduction ``op`` to ``x`` with a custom gradient."""
+    def _forward(feat):
+        return segment_reduce_real(op, feat, offsets)
+    return tf.custom_gradient(_forward)(x)
@@ -175,6 +175,12 @@ def reduce_sum(input):
     return tf.reduce_sum(input)
 
 
+def cumsum(input, dim):
+    """Return the cumulative sum of ``input`` along axis ``dim``.
+
+    Boolean inputs are cast to ``tf.int32`` before accumulating.
+    """
+    values = tf.cast(input, tf.int32) if input.dtype == tf.bool else input
+    return tf.cumsum(values, axis=dim)
+
+
 def mean(input, dim):
     return tf.reduce_mean(input, axis=dim)
 
 
@@ -2,3 +2,4 @@
 from .spmm import *
 from .sddmm import *
 from .edge_softmax import *
+from .segment import *
@@ -2,8 +2,6 @@
 
 from ..base import DGLError
 from .. import backend as F
-from .. import convert
-from .. import function as fn
 
 
 def segment_reduce(seglen, value, reducer='sum'):
@@ -41,20 +39,21 @@ def segment_reduce(seglen, value, reducer='sum'):
             [5., 5., 5.],
             [4., 4., 4.]])
     """
-    ctx = F.context(seglen)
-    # TODO(minjie): a more efficient implementation is to create a graph
-    #   directly from a CSR structure.
-    u = F.copy_to(F.arange(0, F.shape(value)[0], F.int32), ctx)
-    v = F.repeat(F.copy_to(F.arange(0, len(seglen), F.int32), ctx),
-                 seglen, dim=0)
-    if len(u) != len(v):
-        raise DGLError("Invalid seglen array:", seglen,
-                       ". Its summation must be equal to value.shape[0].")
-    num_nodes = {'_U': len(u), '_V': len(seglen)}
-    g = convert.heterograph({('_U', '_E', '_V'): (u, v)}, num_nodes_dict=num_nodes)
-    g.srcdata['h'] = value
-    g.update_all(fn.copy_u('h', 'm'), getattr(fn, reducer)('m', 'h'))
-    return g.dstdata['h']
+    offsets = F.cumsum(
+        F.cat([F.zeros((1,), F.dtype(seglen), F.context(seglen)), seglen], 0), 0)
+    if reducer == 'mean':
+        rst = F.segment_reduce('sum', value, offsets)
+        rst_shape = F.shape(rst)
+        z = F.astype(F.clamp(seglen, 1, len(value)), F.dtype(rst))
+        z_shape = (rst_shape[0],) + (1,) * (len(rst_shape) - 1)
+        return rst / F.reshape(z, z_shape)
+    elif reducer in ['min', 'sum', 'max']:
+        rst = F.segment_reduce(reducer, value, offsets)
+        if reducer in ['min', 'max']:
+            rst = F.replace_inf_with_zero(rst)
+        return rst
+    else:
+        raise DGLError("reducer {} not recognized.".format(reducer))
 
 
 def segment_softmax(seglen, value):
 
@@ -248,4 +248,81 @@ def _gsddmm(gidx, op, lhs, rhs, lhs_target='u', rhs_target='v'):
     return out
 
 
+def _segment_reduce(op, feat, offsets):
+    r"""Segment reduction operator.
+
+    It aggregates the tensor ``feat`` along the first dimension by segments.
+    The argument ``offsets`` stores the boundaries of the segments, so it
+    has one more element than the number of segments.
+    Zero-length segments are allowed.
+
+    Parameters
+    ----------
+    op : str
+        Aggregation method. Can be 'sum', 'max', 'min'.
+    feat : Tensor
+        Value to aggregate.
+    offsets : Tensor
+        Segment boundaries.
+
+    Returns
+    -------
+    tuple(Tensor)
+        The first tensor is the aggregated tensor of shape
+        ``(len(offsets) - 1, feat.shape[1:])``; the second records the
+        argmin/max at each position (``None`` unless op is 'min'/'max').
+
+    Notes
+    -----
+    This function does not handle gradients.
+    """
+    n = F.shape(offsets)[0] - 1
+    out_shp = (n,) + F.shape(feat)[1:]
+    ctx = F.context(feat)
+    dtype = F.dtype(feat)
+    idtype = F.dtype(offsets)
+    out = F.zeros(out_shp, dtype, ctx)
+    arg = None
+    if op in ['min', 'max']:
+        arg = F.zeros(out_shp, idtype, ctx)
+    arg_nd = to_dgl_nd_for_write(arg)
+    _CAPI_DGLKernelSegmentReduce(op,
+                                 to_dgl_nd(feat),
+                                 to_dgl_nd(offsets),
+                                 to_dgl_nd_for_write(out),
+                                 arg_nd)
+    arg = None if arg is None else F.zerocopy_from_dgl_ndarray(arg_nd)
+    return out, arg
+
+
+def _bwd_segment_cmp(feat, arg, m):
+    r"""Backward pass of the 'min'/'max' segment reduction.
+
+    Given the gradient of the reduction output, it produces the gradient
+    with respect to the reduction input.
+
+    Parameters
+    ----------
+    feat : Tensor
+        Gradient of the reduction output.
+    arg : Tensor
+        The ArgMin/Max tensor produced by the segment_reduce op.
+    m : int
+        Size of the first dimension of the input gradient.
+
+    Returns
+    -------
+    Tensor
+        Gradient of the reduction input.
+    """
+    grad_shape = (m,) + F.shape(feat)[1:]
+    grad_in = F.zeros(grad_shape, F.dtype(feat), F.context(feat))
+    _CAPI_DGLKernelBwdSegmentCmp(
+        to_dgl_nd(feat), to_dgl_nd(arg), to_dgl_nd_for_write(grad_in))
+    return grad_in
+
+
 _init_api("dgl.sparse")