15 changes: 0 additions & 15 deletions paddle/phi/infermeta/binary.cc
@@ -1368,21 +1368,6 @@ void DistInferMeta(const MetaTensor& x,
const MetaTensor& y,
float p,
MetaTensor* out) {
auto x_dims = x.dims();
auto y_dims = y.dims();

PADDLE_ENFORCE_NE(common::product(x_dims),
0,
common::errors::InvalidArgument(
"The Input(X) has not been initialized properly. The "
"shape of Input(X) = [%s].",
x_dims));
PADDLE_ENFORCE_NE(common::product(y_dims),
0,
common::errors::InvalidArgument(
"The Input(Y) has not been initialized properly. The "
"shape of Input(Y) = [%s].",
y_dims));
out->set_dims(common::make_ddim({}));
out->set_dtype(x.dtype());
}
4 changes: 4 additions & 0 deletions paddle/phi/kernels/cpu/diagonal_grad_kernel.cc
@@ -28,6 +28,10 @@ void DiagonalGradKernel(const Context& dev_ctx,
int axis1,
int axis2,
DenseTensor* in_grad) {
if (in_grad->numel() == 0) {
dev_ctx.template Alloc<T>(in_grad);
return;
}
const auto* dout = &out_grad;
const T* dout_data = dout->data<T>();
auto dout_dim = common::vectorize(dout->dims());
8 changes: 7 additions & 1 deletion paddle/phi/kernels/cpu/diagonal_kernel.cc
@@ -16,8 +16,8 @@

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/diagonal.h"

namespace phi {

template <typename T, typename Context>
@@ -27,6 +27,12 @@ void DiagonalKernel(const Context& dev_ctx,
int axis1,
int axis2,
DenseTensor* out) {
if (x.numel() == 0) {
phi::Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
return;
}

auto* input = &x;
const T* input_data = input->data<T>();
auto input_dim = common::vectorize(input->dims());
3 changes: 3 additions & 0 deletions paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc
@@ -56,6 +56,9 @@ void KthvalueGradKernel(const Context& dev_ctx,
auto in_dims = x.dims();
auto out_dims = indices.dims();
T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
if (d_x && d_x->numel() == 0) {
return;
}

// For 0D Tensor
if (in_dims.size() == 0) {
9 changes: 8 additions & 1 deletion paddle/phi/kernels/cpu/kthvalue_kernel.cc
@@ -16,9 +16,9 @@

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {
template <typename T, typename Type>
static void getKthvalue(Type input_height,
@@ -80,6 +80,13 @@ void KthvalueKernel(const Context& dev_ctx,
bool keepdim,
DenseTensor* output,
DenseTensor* indices) {
if (x.numel() == 0) {
phi::Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(output->dims())), NAN, output);
phi::Full<int64_t, Context>(
dev_ctx, phi::IntArray(common::vectorize(indices->dims())), 0, indices);
return;
}
const auto& in_dims = x.dims();
if (axis < 0) axis += in_dims.size();

23 changes: 23 additions & 0 deletions paddle/phi/kernels/dist_grad_kernel.cc
@@ -19,6 +19,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/p_norm_grad_kernel.h"
#include "paddle/phi/kernels/reduce_sum_kernel.h"
#include "paddle/phi/kernels/scale_kernel.h"
@@ -56,6 +57,28 @@ void DistGradKernel(const Context& dev_ctx,
return;
}

if ((x_grad && x_grad->numel() == 0) || (y_grad && y_grad->numel() == 0)) {
if (x_grad) {
dev_ctx.template Alloc<T>(x_grad);
if (x_grad->numel() != 0) {
phi::Full<T, Context>(dev_ctx,
phi::IntArray(common::vectorize(x_grad->dims())),
0,
x_grad);
}
}
if (y_grad) {
dev_ctx.template Alloc<T>(y_grad);
if (y_grad->numel() != 0) {
phi::Full<T, Context>(dev_ctx,
phi::IntArray(common::vectorize(y_grad->dims())),
0,
y_grad);
}
}
return;
}

auto t = Subtract<T, Context>(dev_ctx, x, y);
DenseTensor x_grad_tmp;
x_grad_tmp.Resize(t.dims());
7 changes: 6 additions & 1 deletion paddle/phi/kernels/dist_kernel.cc
@@ -17,8 +17,8 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/p_norm_kernel.h"

namespace phi {

template <typename T, typename Context>
@@ -27,6 +27,11 @@ void DistKernel(const Context& dev_ctx,
const DenseTensor& y,
float p,
DenseTensor* out) {
if (x.numel() == 0 || y.numel() == 0) {
phi::Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
return;
}
auto t = Subtract<T, Context>(dev_ctx, x, y);
PNormKernel<T, Context>(dev_ctx, t, p, -1, 1e-12, false, true, out);
}
4 changes: 4 additions & 0 deletions paddle/phi/kernels/gpu/diagonal_grad_kernel.cu
@@ -31,6 +31,10 @@ void DiagonalGradKernel(const Context& dev_ctx,
int axis1,
int axis2,
DenseTensor* in_grad) {
if (in_grad->numel() == 0) {
dev_ctx.template Alloc<T>(in_grad);
return;
}
const auto* dout = &out_grad;
const auto* dout_data = dout->data<T>();
auto dout_dim = dout->dims().Get();
7 changes: 6 additions & 1 deletion paddle/phi/kernels/gpu/diagonal_kernel.cu
@@ -17,8 +17,8 @@
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/diagonal.h"

namespace phi {
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename Context>
@@ -28,6 +28,11 @@ void DiagonalKernel(const Context& dev_ctx,
int axis1,
int axis2,
DenseTensor* out) {
if (x.numel() == 0) {
phi::Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
return;
}
auto* input = &x;
const auto* input_data = input->data<T>();
auto input_dim = input->dims().Get();
7 changes: 7 additions & 0 deletions paddle/phi/kernels/gpu/dist_kernel.cu
@@ -19,6 +19,7 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/dist_kernel.h"
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/math_cuda_utils.h"
#include "paddle/phi/kernels/gpu/reduce.h"
#include "paddle/phi/kernels/legacy/reduce_max_kernel.h"
@@ -123,6 +124,12 @@ void DistKernel(const Context& dev_ctx,
const DenseTensor& y,
float p,
DenseTensor* out) {
if (x.numel() == 0 || y.numel() == 0) {
phi::Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
return;
}

using MT = typename phi::dtype::MPTypeTrait<T>::Type;
DenseTensor intermediate;
const T* x_ptr = x.data<T>();
4 changes: 4 additions & 0 deletions paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu
@@ -44,6 +44,10 @@ void KthvalueGradKernel(const Context& dev_ctx,
const auto& in_dims = x.dims();
auto out_dims = indices.dims();
T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
if (d_x && d_x->numel() == 0) {
return;
}

// For 0D Tensor
if (in_dims.size() == 0) {
phi::funcs::set_constant(dev_ctx, d_x, static_cast<T>(1.0));
9 changes: 9 additions & 0 deletions paddle/phi/kernels/gpu/kthvalue_kernel.cu
@@ -16,6 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
@@ -160,6 +161,14 @@ void KthvalueKernel(const Context& dev_ctx,
bool keepdim,
DenseTensor* output,
DenseTensor* indices) {
if (x.numel() == 0) {
phi::Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(output->dims())), NAN, output);
phi::Full<int64_t, Context>(
dev_ctx, phi::IntArray(common::vectorize(indices->dims())), 0, indices);
Contributor:
Setting output to NaN looks fine. But indices is filled with 0 — has this been compared against torch?

Contributor Author:
indices is of type int64_t, and int64_t has no NaN.
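A minimal sketch (not part of this PR; it assumes only standard Python/NumPy semantics) of why an int64 index tensor has no NaN to fall back on, hence the 0 fill:

```python
import numpy as np

# Floating-point NaN cannot be converted to an integer at all.
try:
    int(float("nan"))
except ValueError as err:
    print(err)  # cannot convert float NaN to integer

# int64 exposes only finite bounds; there is no NaN bit pattern,
# so a degenerate kthvalue result needs an integer sentinel such as 0.
print(np.iinfo(np.int64).min, np.iinfo(np.int64).max)
```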

return;
}

const auto& in_dims = x.dims();
if (axis < 0) axis += in_dims.size();
auto out_dims = output->dims();
3 changes: 3 additions & 0 deletions paddle/phi/kernels/stride/diagonal_grad_kernel.cc
@@ -38,6 +38,9 @@ void DiagonalGradStridedKernel(const Context& dev_ctx,
}
dev_ctx.Alloc(in_grad, in_grad->dtype());
in_grad->set_strides(DenseTensorMeta::calc_strides(in_grad->dims()));
if (in_grad->numel() == 0) {
return;
}
PD_VISIT_ALL_TYPES(in_grad->dtype(), "DiagonalGradStridedKernel", ([&] {
phi::StridedTensorFill<data_t>(*in_grad, 0, in_grad);
}));
3 changes: 3 additions & 0 deletions paddle/phi/kernels/xpu/activation_grad_kernel.cc
@@ -679,6 +679,9 @@ void ExpGradKernel(const Context& dev_ctx,
DenseTensor* dx) {
using XPUType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(dx);
if (dx && dx->numel() == 0) {
return;
}
const XPUType* y_data = reinterpret_cast<const XPUType*>(out.data<T>());
const XPUType* y_grad = reinterpret_cast<const XPUType*>(dout.data<T>());
XPUType* x_grad = reinterpret_cast<XPUType*>(dx->data<T>());
8 changes: 7 additions & 1 deletion paddle/phi/kernels/xpu/diagonal_kernel.cc
@@ -16,7 +16,7 @@

#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"

#include "paddle/phi/kernels/full_kernel.h"
namespace phi {

template <typename T, typename Context>
@@ -26,6 +26,12 @@ void DiagonalKernel(const Context& dev_ctx,
int axis1,
int axis2,
DenseTensor* out) {
if (x.numel() == 0) {
phi::Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
return;
}

using XPUType = typename XPUTypeTrait<T>::Type;
T* out_data = dev_ctx.template Alloc<T>(out);
std::vector<int64_t> xshape = common::vectorize<int64_t>(x.dims());
13 changes: 13 additions & 0 deletions test/legacy_test/test_diagonal_op.py
@@ -134,6 +134,19 @@ def init_config(self):
)


class TestDiagonalOp_ZeroSize(TestDiagonalOp):
def init_config(self):
self.case = np.random.randn(0, 2, 4, 4).astype(self.dtype)
self.inputs = {'Input': self.case}
self.attrs = {'offset': -2, 'axis1': 0, 'axis2': 3}
self.target = np.diagonal(
self.inputs['Input'],
offset=self.attrs['offset'],
axis1=self.attrs['axis1'],
axis2=self.attrs['axis2'],
)


class TestDiagonalAPI(unittest.TestCase):
def setUp(self):
self.shape = [10, 3, 4]
33 changes: 33 additions & 0 deletions test/legacy_test/test_dist_op.py
@@ -236,6 +236,39 @@ def init_case(self):
self.p = 1.5


class TestDistOp_ZeroSize1(TestDistOp):
def setUp(self):
self.op_type = 'dist'
self.python_api = paddle.dist
self.attrs = {}
self.init_case()
self.init_data_type()
self.inputs = {
"X": np.random.random(self.x_shape).astype(self.data_type),
"Y": np.random.random(self.y_shape).astype(self.data_type),
}

self.attrs["p"] = self.p
self.outputs = {
"Out": dist(self.inputs["X"], self.inputs["Y"], self.attrs["p"])
}

def test_check_grad(self):
self.check_grad(["X", "Y"], "Out", check_pir=True)

def init_case(self):
self.x_shape = (0, 1, 5, 6)
Contributor:
Could the unit tests add a case where only one of x or y is a 0-size Tensor?

Contributor Author:
Done.
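For reference, a minimal NumPy sketch (assuming the same p-norm definition as this test file's dist() helper) of the mixed case added below, where only x is 0-size:

```python
import numpy as np

# Only x has a zero-size dimension; broadcasting keeps the result empty.
x = np.random.random((0, 1, 5, 6)).astype("float64")
y = np.random.random((1, 5, 6)).astype("float64")
p = 1.0

diff = x - y                 # broadcasts to shape (0, 1, 5, 6), numel == 0
out = np.power(np.sum(np.power(np.abs(diff), p)), 1.0 / p)
print(diff.shape, out)       # (0, 1, 5, 6) 0.0
```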

self.y_shape = (0, 5, 6)
self.p = 1.0


class TestDistOp_ZeroSize2(TestDistOp_ZeroSize1):
def init_case(self):
self.x_shape = (0, 1, 5, 6)
self.y_shape = (1, 5, 6)
self.p = 1.0


class TestDistAPI(unittest.TestCase):
def init_data_type(self):
self.data_type = (
11 changes: 10 additions & 1 deletion test/legacy_test/test_kthvalue_op.py
@@ -43,13 +43,17 @@ def init_args(self):
def init_dtype(self):
self.dtype = np.float64

def init_shape(self):
self.shape = [2, 1, 2, 4, 10]

def setUp(self):
self.op_type = "kthvalue"
self.prim_op_type = "prim"
self.python_api = paddle.kthvalue
self.public_python_api = paddle.kthvalue
self.init_dtype()
self.input_data = np.random.random([2, 1, 2, 4, 10]).astype(self.dtype)
self.init_shape()
self.input_data = np.random.random(self.shape).astype(self.dtype)
self.init_args()
self.inputs = {'X': self.input_data}
self.attrs = {'k': self.k, 'axis': self.axis}
@@ -77,6 +81,11 @@ def init_dtype(self):
self.dtype = np.float16


class TestKthvalueOp_ZeroSize(TestKthvalueOp):
def init_shape(self):
self.shape = [2, 1, 0, 4, 10]


class TestKthvalueOpWithKeepdim(OpTest):
def init_args(self):
self.k = 2
Expand Down