2 changes: 2 additions & 0 deletions paddle/phi/kernels/cpu/cum_grad_kernel.cc
@@ -54,6 +54,8 @@ PD_REGISTER_KERNEL(cumsum_grad,
phi::CumsumGradKernel,
float,
double,
uint8_t,
int8_t,
int16_t,
int,
int64_t,
2 changes: 2 additions & 0 deletions paddle/phi/kernels/cpu/cum_kernel.cc
@@ -273,6 +273,8 @@ PD_REGISTER_KERNEL(cumsum,
phi::CumsumKernel,
float,
double,
uint8_t,
int8_t,
int16_t,
int,
int64_t,
2 changes: 2 additions & 0 deletions paddle/phi/kernels/gpu/cum_grad_kernel.cu
@@ -81,6 +81,8 @@ PD_REGISTER_KERNEL(cumsum_grad,
phi::CumsumGradKernel,
float,
double,
uint8_t,
int8_t,
int16_t,
int,
int64_t,
2 changes: 2 additions & 0 deletions paddle/phi/kernels/gpu/cum_kernel.cu
@@ -508,6 +508,8 @@ PD_REGISTER_KERNEL(cumsum,
phi::CumsumKernel,
float,
double,
uint8_t,
int8_t,
int16_t,
int,
int64_t,
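Taken together, the four hunks above register uint8_t and int8_t for the CPU and GPU cumsum forward and gradient kernels; previously the smallest registered integer type was int16_t. A minimal Python-level sketch of what this enables (the tensor values and the explicit dtype argument are illustrative, not taken from this diff): when the caller requests a small integer dtype explicitly, the input is cast before the op runs and the newly registered kernel performs the accumulation in that precision. The static-graph tests added later in this diff exercise the same kernels with uint8/int8 placeholders.

```python
import paddle

x = paddle.to_tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])  # float32 input

# Passing dtype casts the input to int8 first, so the newly registered
# int8 kernel performs the accumulation and the result stays int8.
y = paddle.cumsum(x, axis=1, dtype='int8')
print(y.dtype)    # paddle.int8
print(y.numpy())  # [[ 1  3  6]
                  #  [ 4  9 15]]
```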
9 changes: 8 additions & 1 deletion python/paddle/tensor/math.py
@@ -4308,7 +4308,7 @@ def cumsum(
Args:
x (Tensor): The input tensor needed to be cumsumed.
axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
dtype (str|paddle.dtype|np.dtype|None, optional): The data type of the output tensor, can be bfloat16, float16, float32, float64, int32, int64, complex64, complex128. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
dtype (str|paddle.dtype|np.dtype|None, optional): The data type of the output tensor, which can be bfloat16, float16, float32, float64, int32, int64, complex64 or complex128. By default it is int64 if the input x is uint8/int8/int16/int32; otherwise it is None. If it is not None, the input tensor is cast to dtype before the operation is performed, which is useful for preventing data type overflows.
name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

Returns:
@@ -4350,6 +4350,13 @@ def cumsum(
        flatten = False
    if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype):
        x = cast(x, dtype)
    elif isinstance(x, paddle.Tensor) and x.dtype in [
        paddle.uint8,
        paddle.int8,
        paddle.int16,
        paddle.int32,
    ]:
        x = cast(x, "int64")

    if in_dynamic_or_pir_mode():
        if axis is None:
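The new elif branch gives uint8/int8/int16/int32 inputs the int64 default described in the updated docstring: when dtype is omitted, the input is cast to int64 before the op runs, so long running sums no longer overflow in the input precision. A short sketch of the effect, with illustrative values that are not part of the diff:

```python
import numpy as np
import paddle

x = paddle.to_tensor(np.full(10, 100, dtype=np.int8))  # each element fits in int8

y = paddle.cumsum(x)      # no dtype given: promoted to int64 by the branch above
print(y.dtype)            # paddle.int64
print(y.numpy()[-1])      # 1000 -- a value that would have overflowed int8
```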
213 changes: 213 additions & 0 deletions test/legacy_test/test_cumsum_op.py
@@ -124,6 +124,219 @@ def test_name(self):
self.assertTrue('out' in y.name)


class TestCumsumOp_INT(unittest.TestCase):
    def run_cases(self):
        data_np = np.arange(12).reshape(3, 4).astype(np.uint8)
        data = paddle.to_tensor(data_np)
        y = paddle.cumsum(data)
        z = np.cumsum(data_np)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=0)
        z = np.cumsum(data_np, axis=0)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-1)
        z = np.cumsum(data_np, axis=-1)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-2)
        z = np.cumsum(data_np, axis=-2)
        np.testing.assert_array_equal(z, y.numpy())

        data_np = np.arange(12).reshape(3, 4).astype(np.int8)
        data = paddle.to_tensor(data_np)
        y = paddle.cumsum(data)
        z = np.cumsum(data_np)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=0)
        z = np.cumsum(data_np, axis=0)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-1)
        z = np.cumsum(data_np, axis=-1)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-2)
        z = np.cumsum(data_np, axis=-2)
        np.testing.assert_array_equal(z, y.numpy())

        data_np = np.arange(12).reshape(3, 4).astype(np.int16)
        data = paddle.to_tensor(data_np)
        y = paddle.cumsum(data)
        z = np.cumsum(data_np)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=0)
        z = np.cumsum(data_np, axis=0)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-1)
        z = np.cumsum(data_np, axis=-1)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-2)
        z = np.cumsum(data_np, axis=-2)
        np.testing.assert_array_equal(z, y.numpy())

        data_np = np.arange(12).reshape(3, 4).astype(np.int32)
        data = paddle.to_tensor(data_np)
        y = paddle.cumsum(data)
        z = np.cumsum(data_np)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=0)
        z = np.cumsum(data_np, axis=0)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-1)
        z = np.cumsum(data_np, axis=-1)
        np.testing.assert_array_equal(z, y.numpy())
        y = paddle.cumsum(data, axis=-2)
        z = np.cumsum(data_np, axis=-2)
        np.testing.assert_array_equal(z, y.numpy())

    def run_static_uint8(self, use_gpu=False):
        with paddle.static.program_guard(paddle.static.Program()):
            data_np = np.random.random((100, 100)).astype(np.uint8)
            x = paddle.static.data('X', [100, 100], dtype='uint8')
            y = paddle.cumsum(x)
            y2 = paddle.cumsum(x, axis=0)
            y3 = paddle.cumsum(x, axis=-1)
            y4 = paddle.cumsum(x, axis=-2)
            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
            exe = base.Executor(place)
            exe.run(paddle.static.default_startup_program())
            out = exe.run(
                feed={'X': data_np},
                fetch_list=[
                    y,
                    y2,
                    y3,
                    y4,
                ],
            )
            z = np.cumsum(data_np)
            np.testing.assert_allclose(z, out[0], rtol=1e-05)
            z = np.cumsum(data_np, axis=0)
            np.testing.assert_allclose(z, out[1], rtol=1e-05)
            z = np.cumsum(data_np, axis=-1)
            np.testing.assert_allclose(z, out[2], rtol=1e-05)
            z = np.cumsum(data_np, axis=-2)
            np.testing.assert_allclose(z, out[3], rtol=1e-05)

    def run_static_int8(self, use_gpu=False):
        with paddle.static.program_guard(paddle.static.Program()):
            data_np = np.random.random((100, 100)).astype(np.int8)
            x = paddle.static.data('X', [100, 100], dtype='int8')
            y = paddle.cumsum(x)
            y2 = paddle.cumsum(x, axis=0)
            y3 = paddle.cumsum(x, axis=-1)
            y4 = paddle.cumsum(x, axis=-2)
            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
            exe = base.Executor(place)
            exe.run(paddle.static.default_startup_program())
            out = exe.run(
                feed={'X': data_np},
                fetch_list=[
                    y,
                    y2,
                    y3,
                    y4,
                ],
            )
            z = np.cumsum(data_np)
            np.testing.assert_allclose(z, out[0], rtol=1e-05)
            z = np.cumsum(data_np, axis=0)
            np.testing.assert_allclose(z, out[1], rtol=1e-05)
            z = np.cumsum(data_np, axis=-1)
            np.testing.assert_allclose(z, out[2], rtol=1e-05)
            z = np.cumsum(data_np, axis=-2)
            np.testing.assert_allclose(z, out[3], rtol=1e-05)

    def run_static_int16(self, use_gpu=False):
        with paddle.static.program_guard(paddle.static.Program()):
            data_np = np.random.random((100, 100)).astype(np.int16)
            x = paddle.static.data('X', [100, 100], dtype='int16')
            y = paddle.cumsum(x)
            y2 = paddle.cumsum(x, axis=0)
            y3 = paddle.cumsum(x, axis=-1)
            y4 = paddle.cumsum(x, axis=-2)
            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
            exe = base.Executor(place)
            exe.run(paddle.static.default_startup_program())
            out = exe.run(
                feed={'X': data_np},
                fetch_list=[
                    y,
                    y2,
                    y3,
                    y4,
                ],
            )
            z = np.cumsum(data_np)
            np.testing.assert_allclose(z, out[0], rtol=1e-05)
            z = np.cumsum(data_np, axis=0)
            np.testing.assert_allclose(z, out[1], rtol=1e-05)
            z = np.cumsum(data_np, axis=-1)
            np.testing.assert_allclose(z, out[2], rtol=1e-05)
            z = np.cumsum(data_np, axis=-2)
            np.testing.assert_allclose(z, out[3], rtol=1e-05)

    def run_static_uint16(self, use_gpu=False):
        with paddle.static.program_guard(paddle.static.Program()):
            data_np = np.random.random((100, 100)).astype(np.uint16)
            x = paddle.static.data('X', [100, 100], dtype='uint16')
            y = paddle.cumsum(x)
            y2 = paddle.cumsum(x, axis=0)
            y3 = paddle.cumsum(x, axis=-1)
            y4 = paddle.cumsum(x, axis=-2)
            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
            exe = base.Executor(place)
            exe.run(paddle.static.default_startup_program())
            out = exe.run(
                feed={'X': data_np},
                fetch_list=[
                    y,
                    y2,
                    y3,
                    y4,
                ],
            )
            z = np.cumsum(data_np)
            np.testing.assert_allclose(z, out[0], rtol=1e-05)
            z = np.cumsum(data_np, axis=0)
            np.testing.assert_allclose(z, out[1], rtol=1e-05)
            z = np.cumsum(data_np, axis=-1)
            np.testing.assert_allclose(z, out[2], rtol=1e-05)
            z = np.cumsum(data_np, axis=-2)
            np.testing.assert_allclose(z, out[3], rtol=1e-05)

    def test_cpu_dygraph(self):
        paddle.disable_static(paddle.base.CPUPlace())
        self.run_cases()
        paddle.enable_static()

    def test_cpu_static(self):
        self.run_static_uint8()
        self.run_static_int8()
        self.run_static_int16()

    def test_gpu_dygraph(self):
        if not base.core.is_compiled_with_cuda():
            return
        paddle.disable_static(paddle.base.CUDAPlace(0))
        self.run_cases()
        paddle.enable_static()

    def test_gpu_static(self):
        if not base.core.is_compiled_with_cuda():
            return
        self.run_static_uint8(use_gpu=True)
        self.run_static_int8(use_gpu=True)
        self.run_static_uint16(use_gpu=True)
        self.run_static_int16(use_gpu=True)

    def test_name(self):
        with (
            paddle.pir_utils.OldIrGuard(),
            base.program_guard(base.Program()),
        ):
            x = paddle.static.data('x', [3, 4])
            y = paddle.cumsum(x, name='out')
            self.assertTrue('out' in y.name)


def cumsum_wrapper(x, axis=-1, flatten=False, exclusive=False, reverse=False):
return paddle._C_ops.cumsum(x, axis, flatten, exclusive, reverse)
