Skip to content

Commit b040e89

Browse files
[Accuracy diff No.167] Fix accuracy (output type) diff for paddle.cumsum API (#74625)
* fix(math.py, unary.cc): fix output type diff for cumsum kernel
* fix(math.py): fix output type diff for cumsum kernel
* fix(math.py): fix `cumsum` documentation
* fix(cum/cum_grad.cc/cu, test_cumsum_op.py): fix output type diff for cumsum kernel and add unit test
1 parent 1d9d7a3 commit b040e89

File tree

6 files changed

+229
-1
lines changed

6 files changed

+229
-1
lines changed

paddle/phi/kernels/cpu/cum_grad_kernel.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ PD_REGISTER_KERNEL(cumsum_grad,
5454
phi::CumsumGradKernel,
5555
float,
5656
double,
57+
uint8_t,
58+
int8_t,
5759
int16_t,
5860
int,
5961
int64_t,

paddle/phi/kernels/cpu/cum_kernel.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,8 @@ PD_REGISTER_KERNEL(cumsum,
273273
phi::CumsumKernel,
274274
float,
275275
double,
276+
uint8_t,
277+
int8_t,
276278
int16_t,
277279
int,
278280
int64_t,

paddle/phi/kernels/gpu/cum_grad_kernel.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ PD_REGISTER_KERNEL(cumsum_grad,
8181
phi::CumsumGradKernel,
8282
float,
8383
double,
84+
uint8_t,
85+
int8_t,
8486
int16_t,
8587
int,
8688
int64_t,

paddle/phi/kernels/gpu/cum_kernel.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,8 @@ PD_REGISTER_KERNEL(cumsum,
508508
phi::CumsumKernel,
509509
float,
510510
double,
511+
uint8_t,
512+
int8_t,
511513
int16_t,
512514
int,
513515
int64_t,

python/paddle/tensor/math.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4152,7 +4152,7 @@ def cumsum(
41524152
Args:
41534153
x (Tensor): The input tensor needed to be cumsumed.
41544154
axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
4155-
dtype (str|paddle.dtype|np.dtype|None, optional): The data type of the output tensor, can be bfloat16, float16, float32, float64, int32, int64, complex64, complex128. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
4155+
dtype (str|paddle.dtype|np.dtype|None, optional): The data type of the output tensor, can be bfloat16, float16, float32, float64, int32, int64, complex64, complex128. By default, it is int64 if the input x is int8/int16/int32; otherwise, it is None. If it is not None, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows.
41564156
name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
41574157
41584158
Returns:
@@ -4194,6 +4194,13 @@ def cumsum(
41944194
flatten = False
41954195
if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype):
41964196
x = cast(x, dtype)
4197+
elif isinstance(x, paddle.Tensor) and x.dtype in [
4198+
paddle.uint8,
4199+
paddle.int8,
4200+
paddle.int16,
4201+
paddle.int32,
4202+
]:
4203+
x = cast(x, "int64")
41974204

41984205
if in_dynamic_or_pir_mode():
41994206
if axis is None:

test/legacy_test/test_cumsum_op.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,219 @@ def test_name(self):
124124
self.assertTrue('out' in y.name)
125125

126126

127+
class TestCumsumOp_INT(unittest.TestCase):
128+
def run_cases(self):
129+
data_np = np.arange(12).reshape(3, 4).astype(np.uint8)
130+
data = paddle.to_tensor(data_np)
131+
y = paddle.cumsum(data)
132+
z = np.cumsum(data_np)
133+
np.testing.assert_array_equal(z, y.numpy())
134+
y = paddle.cumsum(data, axis=0)
135+
z = np.cumsum(data_np, axis=0)
136+
np.testing.assert_array_equal(z, y.numpy())
137+
y = paddle.cumsum(data, axis=-1)
138+
z = np.cumsum(data_np, axis=-1)
139+
np.testing.assert_array_equal(z, y.numpy())
140+
y = paddle.cumsum(data, axis=-2)
141+
z = np.cumsum(data_np, axis=-2)
142+
np.testing.assert_array_equal(z, y.numpy())
143+
144+
data_np = np.arange(12).reshape(3, 4).astype(np.int8)
145+
data = paddle.to_tensor(data_np)
146+
y = paddle.cumsum(data)
147+
z = np.cumsum(data_np)
148+
np.testing.assert_array_equal(z, y.numpy())
149+
y = paddle.cumsum(data, axis=0)
150+
z = np.cumsum(data_np, axis=0)
151+
np.testing.assert_array_equal(z, y.numpy())
152+
y = paddle.cumsum(data, axis=-1)
153+
z = np.cumsum(data_np, axis=-1)
154+
np.testing.assert_array_equal(z, y.numpy())
155+
y = paddle.cumsum(data, axis=-2)
156+
z = np.cumsum(data_np, axis=-2)
157+
np.testing.assert_array_equal(z, y.numpy())
158+
159+
data_np = np.arange(12).reshape(3, 4).astype(np.int16)
160+
data = paddle.to_tensor(data_np)
161+
y = paddle.cumsum(data)
162+
z = np.cumsum(data_np)
163+
np.testing.assert_array_equal(z, y.numpy())
164+
y = paddle.cumsum(data, axis=0)
165+
z = np.cumsum(data_np, axis=0)
166+
np.testing.assert_array_equal(z, y.numpy())
167+
y = paddle.cumsum(data, axis=-1)
168+
z = np.cumsum(data_np, axis=-1)
169+
np.testing.assert_array_equal(z, y.numpy())
170+
y = paddle.cumsum(data, axis=-2)
171+
z = np.cumsum(data_np, axis=-2)
172+
np.testing.assert_array_equal(z, y.numpy())
173+
174+
data_np = np.arange(12).reshape(3, 4).astype(np.int32)
175+
data = paddle.to_tensor(data_np)
176+
y = paddle.cumsum(data)
177+
z = np.cumsum(data_np)
178+
np.testing.assert_array_equal(z, y.numpy())
179+
y = paddle.cumsum(data, axis=0)
180+
z = np.cumsum(data_np, axis=0)
181+
np.testing.assert_array_equal(z, y.numpy())
182+
y = paddle.cumsum(data, axis=-1)
183+
z = np.cumsum(data_np, axis=-1)
184+
np.testing.assert_array_equal(z, y.numpy())
185+
y = paddle.cumsum(data, axis=-2)
186+
z = np.cumsum(data_np, axis=-2)
187+
np.testing.assert_array_equal(z, y.numpy())
188+
189+
def run_static_uint8(self, use_gpu=False):
190+
with paddle.static.program_guard(paddle.static.Program()):
191+
data_np = np.random.random((100, 100)).astype(np.uint8)
192+
x = paddle.static.data('X', [100, 100], dtype='uint8')
193+
y = paddle.cumsum(x)
194+
y2 = paddle.cumsum(x, axis=0)
195+
y3 = paddle.cumsum(x, axis=-1)
196+
y4 = paddle.cumsum(x, axis=-2)
197+
place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
198+
exe = base.Executor(place)
199+
exe.run(paddle.static.default_startup_program())
200+
out = exe.run(
201+
feed={'X': data_np},
202+
fetch_list=[
203+
y,
204+
y2,
205+
y3,
206+
y4,
207+
],
208+
)
209+
z = np.cumsum(data_np)
210+
np.testing.assert_allclose(z, out[0], rtol=1e-05)
211+
z = np.cumsum(data_np, axis=0)
212+
np.testing.assert_allclose(z, out[1], rtol=1e-05)
213+
z = np.cumsum(data_np, axis=-1)
214+
np.testing.assert_allclose(z, out[2], rtol=1e-05)
215+
z = np.cumsum(data_np, axis=-2)
216+
np.testing.assert_allclose(z, out[3], rtol=1e-05)
217+
218+
def run_static_int8(self, use_gpu=False):
219+
with paddle.static.program_guard(paddle.static.Program()):
220+
data_np = np.random.random((100, 100)).astype(np.int8)
221+
x = paddle.static.data('X', [100, 100], dtype='int8')
222+
y = paddle.cumsum(x)
223+
y2 = paddle.cumsum(x, axis=0)
224+
y3 = paddle.cumsum(x, axis=-1)
225+
y4 = paddle.cumsum(x, axis=-2)
226+
place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
227+
exe = base.Executor(place)
228+
exe.run(paddle.static.default_startup_program())
229+
out = exe.run(
230+
feed={'X': data_np},
231+
fetch_list=[
232+
y,
233+
y2,
234+
y3,
235+
y4,
236+
],
237+
)
238+
z = np.cumsum(data_np)
239+
np.testing.assert_allclose(z, out[0], rtol=1e-05)
240+
z = np.cumsum(data_np, axis=0)
241+
np.testing.assert_allclose(z, out[1], rtol=1e-05)
242+
z = np.cumsum(data_np, axis=-1)
243+
np.testing.assert_allclose(z, out[2], rtol=1e-05)
244+
z = np.cumsum(data_np, axis=-2)
245+
np.testing.assert_allclose(z, out[3], rtol=1e-05)
246+
247+
def run_static_int16(self, use_gpu=False):
248+
with paddle.static.program_guard(paddle.static.Program()):
249+
data_np = np.random.random((100, 100)).astype(np.int16)
250+
x = paddle.static.data('X', [100, 100], dtype='int16')
251+
y = paddle.cumsum(x)
252+
y2 = paddle.cumsum(x, axis=0)
253+
y3 = paddle.cumsum(x, axis=-1)
254+
y4 = paddle.cumsum(x, axis=-2)
255+
place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
256+
exe = base.Executor(place)
257+
exe.run(paddle.static.default_startup_program())
258+
out = exe.run(
259+
feed={'X': data_np},
260+
fetch_list=[
261+
y,
262+
y2,
263+
y3,
264+
y4,
265+
],
266+
)
267+
z = np.cumsum(data_np)
268+
np.testing.assert_allclose(z, out[0], rtol=1e-05)
269+
z = np.cumsum(data_np, axis=0)
270+
np.testing.assert_allclose(z, out[1], rtol=1e-05)
271+
z = np.cumsum(data_np, axis=-1)
272+
np.testing.assert_allclose(z, out[2], rtol=1e-05)
273+
z = np.cumsum(data_np, axis=-2)
274+
np.testing.assert_allclose(z, out[3], rtol=1e-05)
275+
276+
def run_static_uint16(self, use_gpu=False):
277+
with paddle.static.program_guard(paddle.static.Program()):
278+
data_np = np.random.random((100, 100)).astype(np.uint16)
279+
x = paddle.static.data('X', [100, 100], dtype='uint16')
280+
y = paddle.cumsum(x)
281+
y2 = paddle.cumsum(x, axis=0)
282+
y3 = paddle.cumsum(x, axis=-1)
283+
y4 = paddle.cumsum(x, axis=-2)
284+
place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
285+
exe = base.Executor(place)
286+
exe.run(paddle.static.default_startup_program())
287+
out = exe.run(
288+
feed={'X': data_np},
289+
fetch_list=[
290+
y,
291+
y2,
292+
y3,
293+
y4,
294+
],
295+
)
296+
z = np.cumsum(data_np)
297+
np.testing.assert_allclose(z, out[0], rtol=1e-05)
298+
z = np.cumsum(data_np, axis=0)
299+
np.testing.assert_allclose(z, out[1], rtol=1e-05)
300+
z = np.cumsum(data_np, axis=-1)
301+
np.testing.assert_allclose(z, out[2], rtol=1e-05)
302+
z = np.cumsum(data_np, axis=-2)
303+
np.testing.assert_allclose(z, out[3], rtol=1e-05)
304+
305+
def test_cpu_dygraph(self):
306+
paddle.disable_static(paddle.base.CPUPlace())
307+
self.run_cases()
308+
paddle.enable_static()
309+
310+
def test_cpu_static(self):
311+
self.run_static_uint8()
312+
self.run_static_int8()
313+
self.run_static_int16()
314+
315+
def test_gpu_dygraph(self):
316+
if not base.core.is_compiled_with_cuda():
317+
return
318+
paddle.disable_static(paddle.base.CUDAPlace(0))
319+
self.run_cases()
320+
paddle.enable_static()
321+
322+
def test_gpu_static(self):
323+
if not base.core.is_compiled_with_cuda():
324+
return
325+
self.run_static_uint8(use_gpu=True)
326+
self.run_static_int8(use_gpu=True)
327+
self.run_static_uint16(use_gpu=True)
328+
self.run_static_int16(use_gpu=True)
329+
330+
def test_name(self):
331+
with (
332+
paddle.pir_utils.OldIrGuard(),
333+
base.program_guard(base.Program()),
334+
):
335+
x = paddle.static.data('x', [3, 4])
336+
y = paddle.cumsum(x, name='out')
337+
self.assertTrue('out' in y.name)
338+
339+
127340
def cumsum_wrapper(x, axis=-1, flatten=False, exclusive=False, reverse=False):
128341
return paddle._C_ops.cumsum(x, axis, flatten, exclusive, reverse)
129342

0 commit comments

Comments (0)