Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b62a062
refine forrange (#72360)
wanghuancoder Apr 20, 2025
b40d76b
reduce support big tensor (#71970)
wanghuancoder Apr 23, 2025
df4697f
[PHI] Fix gridDim limit for reduce kernel (#72507)
lshpku Apr 27, 2025
f179a6a
[API] isclose support bigtensor (#72516)
wanghuancoder Apr 28, 2025
d8c1ad1
[API] isnan isinf isfinite support bigtensor (#72517)
wanghuancoder Apr 28, 2025
04d7eb9
[PHI] Fix cum kernel for big tensor (#72562)
lshpku May 6, 2025
752eb6e
[PHI] Preliminary fix for elementwise broadcast int32 shape overflow …
Enigmatisms May 7, 2025
98b4876
[PHI] Align linalg.solve kernel with torch (#72608)
lshpku May 9, 2025
4e331fd
Update strided copy kernel (#72662)
Dmovic May 13, 2025
831663e
[PHI] Fix grid sample kernel for big tensor (#72628)
lshpku May 14, 2025
cc3177f
[PHI] Fix argsort big tensor bug (#72712)
Enigmatisms May 15, 2025
1bf96d4
[PHI] Fix contiguous kernel for big tensor (#72705)
Dmovic May 15, 2025
7ae5373
[PHI] Fix flatten and split kernel for big tensor (#72634)
ggggxm May 19, 2025
7c1c28f
[PHI] Fix out-of-bound issue of paddle.take_along_axis (#72757)
xkkkkkk23 May 19, 2025
63ce668
[PHI] fix paddle.diag with big tensor (#72638)
ZhangX-21 May 19, 2025
8c64555
[API] fix paddle.cross with big tensor (#72652)
ZhangX-21 May 19, 2025
3a81f1c
[PHI] Fix paddle.where api for big tensor (#72717)
huangjiyi May 19, 2025
6ffb631
[PHI] Fix bincount kernel for big tensor (#72706)
ggggxm May 20, 2025
49b0b27
[PHI] Fix full_like kernel for big tensor (#72831)
lshpku May 21, 2025
4f98369
[API] Fix int overflow and float16 support for paddle.frac (#72815)
xkkkkkk23 May 21, 2025
578d5a9
[PHI] Align paddle.inner with torch in matmul logic (#72843)
lshpku May 22, 2025
07ba4c9
[PHI] Fix paddle.var & paddle.std float16 overflow (#72650)
Enigmatisms May 12, 2025
134cc48
[PHI] Fix logsumexp precision problem (#72681)
Enigmatisms May 16, 2025
84b2680
[Accuracy diff No.55-56、76-77] Fix accuracy diff for var&std API (#72…
ooooo-create May 23, 2025
c3ef9a5
[Accuracy diff No.21] Fix accuracy diff for heaviside API (#72894)
ooooo-create May 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions paddle/fluid/operators/elementwise/elementwise_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ class ElementwiseOp : public framework::OperatorWithKernel {
axis));
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
std::vector<int64_t> x_dims_array(max_dim);
std::vector<int64_t> y_dims_array(max_dim);
std::vector<int64_t> out_dims_array(max_dim);
#ifdef PADDLE_WITH_DNNL
// Broadcasting of dims has to be done on Paddle shapes (NHWC)
// if model is using NHWC and any of shapes in at least 3D
Expand All @@ -120,8 +120,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
if (should_rotate) {
// Pick bigger shape and rotate this one
bool x_over_y = (x_dims.size() > y_dims.size());
auto vdims = x_over_y ? common::vectorize<int>(x_dims)
: common::vectorize<int>(y_dims);
auto vdims = x_over_y ? common::vectorize<int64_t>(x_dims)
: common::vectorize<int64_t>(y_dims);
std::rotate(vdims.begin() + 1, vdims.begin() + 2, vdims.end());
if (x_over_y) {
x_dims = common::make_ddim(vdims);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1308,7 +1308,13 @@ void max_grad(const Tensor& x,
auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
auto out_tmp = out.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
} else {
auto axis_ = std::vector<int64_t>();
if (reduce_all) {
Expand All @@ -1329,7 +1335,13 @@ void max_grad(const Tensor& x,
auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
auto out_tmp = out_.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis_, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
}
set_output<T>(x_grad_tmp, x_grad);
}
Expand Down Expand Up @@ -1361,7 +1373,13 @@ void min_grad(const Tensor& x,
auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
auto out_tmp = out.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
} else {
auto axis_ = std::vector<int64_t>();
if (reduce_all) {
Expand All @@ -1382,7 +1400,13 @@ void min_grad(const Tensor& x,
auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
auto out_tmp = out_.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis_, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
}
set_output<T>(x_grad_tmp, x_grad);
}
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h
Original file line number Diff line number Diff line change
Expand Up @@ -909,7 +909,7 @@ Tensor flatten_decomp(const Tensor& x, int start_axis, int end_axis) {
return reshape<T>(x, x_dim);
}

int slice_numel = 1;
int64_t slice_numel = 1;
for (int i = start_axis; i <= end_axis; ++i) {
slice_numel *= x_dim[i];
}
Expand Down
209 changes: 108 additions & 101 deletions paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h
Original file line number Diff line number Diff line change
Expand Up @@ -1575,107 +1575,6 @@ void pad_grad(const Tensor& input,
}
}

template <typename T>
// Backward (VJP) decomposition rule for paddle.max: the incoming gradient is
// routed to every input element that equals the reduced maximum; all other
// elements receive zero.  NOTE(review): when several elements tie for the
// maximum, each tied element receives the FULL upstream gradient (no split) —
// confirm this matches the intended max-grad semantics of the framework.
void max_grad(const Tensor& x,
              const Tensor& out,       // forward result of max(x, axis, keepdim)
              const Tensor& out_grad,  // gradient w.r.t. `out`
              const IntArray& axis,    // reduction axes given to the forward op
              bool keepdim,
              bool reduce_all,
              Tensor* x_grad) {        // output slot; may be null when unused
  // Nothing to compute when the caller did not request this gradient.
  if (!x_grad) {
    return;
  }

  Tensor x_grad_tmp;
  if (has_dynamic_shape(x.shape())) {
    // Dynamic-shape path: shapes are tensors computed at runtime, so all
    // shape-dependent ops go through the `backend::` variants.
    const Tensor x_shape = shape64<T>(x);
    const Tensor zero_tensor =
        backend::full_with_tensor<T>(x_shape, 0.0, x.dtype(), x.place());
    const int64_t axis_size = axis.size();
    const int64_t x_dim_size = x.dims().size();

    // NOTE(review): `reduce_all` is unconditionally overwritten to false here,
    // so the incoming argument is ignored and the condition below degenerates
    // to (axis_size == 0 || axis_size == x_dim_size) — verify this is intended.
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      // out/out_grad already broadcast cleanly against x (kept dims or rank
      // <= 1), so expand directly and mask by equality with the max.
      auto out_grad_tmp = backend::expand<T>(out_grad, x_shape);
      auto out_tmp = backend::expand<T>(out, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      // Reduced dims were squeezed out of `out`; re-insert size-1 dims at the
      // (normalized) reduction axes before expanding back to x's shape.
      const Tensor out_grad_shape = shape64<T>(out_grad);
      auto axis_ = std::vector<int64_t>();

      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      const Tensor out_grad_shape_extend =
          get_unsqueeze_dims<T>(out_grad_shape, axis_);
      auto out_grad_ = backend::reshape<T>(out_grad, out_grad_shape_extend);
      auto out_ = backend::reshape<T>(out, out_grad_shape_extend);
      auto out_grad_tmp = backend::expand<T>(out_grad_, x_shape);
      auto out_tmp = backend::expand<T>(out_, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  } else {
    // Static-shape path: same algorithm with shapes known at trace time.
    auto zero_tensor =
        full<T>(common::vectorize(x.dims()), 0.0, x.dtype(), x.place());
    std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
    int64_t axis_size = axis.size();
    int64_t x_dim_size = x_dim.size();
    // NOTE(review): same `reduce_all` overwrite as the dynamic branch above.
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
      auto out_tmp = out.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      auto axis_ = std::vector<int64_t>();
      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      // Re-insert the reduced axes as size-1 dims, then broadcast to x.
      auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
      auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
      auto out_ = reshape<T>(out, out_grad_shape);
      auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
      auto out_tmp = out_.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  }
  set_output<T>(x_grad_tmp, x_grad);
}

template <typename T>
void slice_grad(const Tensor& input,
const Tensor& out_grad,
Expand Down Expand Up @@ -3498,6 +3397,114 @@ void amin_grad(const Tensor& x,
}
}

template <typename T>
// Backward (VJP) decomposition rule for paddle.max: routes the upstream
// gradient to the input elements that equal the reduced maximum, zero
// elsewhere.  When `axis` is empty (full reduction) the computation is
// delegated to amax_grad — presumably so tied maxima share the gradient
// evenly rather than each receiving the full gradient; TODO confirm against
// amax_grad's tie-handling.
void max_grad(const Tensor& x,
              const Tensor& out,       // forward result of max(x, axis, keepdim)
              const Tensor& out_grad,  // gradient w.r.t. `out`
              const IntArray& axis,    // reduction axes given to the forward op
              bool keepdim,
              bool reduce_all,
              Tensor* x_grad) {        // output slot; may be null when unused
  // Nothing to compute when the caller did not request this gradient.
  if (!x_grad) {
    return;
  }

  // Empty axis list == reduce over all dims: defer to amax_grad and return.
  if (axis.size() == 0) {
    Tensor x_grad_tmp;
    amax_grad<T>(x, out, out_grad, axis, keepdim, reduce_all, &x_grad_tmp);
    set_output<T>(x_grad_tmp, x_grad);
    return;
  }

  Tensor x_grad_tmp;
  if (has_dynamic_shape(x.shape())) {
    // Dynamic-shape path: shapes are runtime tensors, so shape-dependent ops
    // go through the `backend::` variants.
    const Tensor x_shape = shape64<T>(x);
    const Tensor zero_tensor =
        backend::full_with_tensor<T>(x_shape, 0.0, x.dtype(), x.place());
    const int64_t axis_size = axis.size();
    const int64_t x_dim_size = x.dims().size();

    // NOTE(review): `reduce_all` is overwritten to false, so the incoming
    // argument is ignored; with the early return above, `axis_size == 0` is
    // also unreachable, leaving effectively (axis_size == x_dim_size).
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      // out/out_grad already broadcast cleanly against x (kept dims or rank
      // <= 1): expand directly and mask by equality with the max.
      auto out_grad_tmp = backend::expand<T>(out_grad, x_shape);
      auto out_tmp = backend::expand<T>(out, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      // Reduced dims were squeezed out of `out`; re-insert size-1 dims at the
      // (normalized) reduction axes before expanding back to x's shape.
      const Tensor out_grad_shape = shape64<T>(out_grad);
      auto axis_ = std::vector<int64_t>();

      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      const Tensor out_grad_shape_extend =
          get_unsqueeze_dims<T>(out_grad_shape, axis_);
      auto out_grad_ = backend::reshape<T>(out_grad, out_grad_shape_extend);
      auto out_ = backend::reshape<T>(out, out_grad_shape_extend);
      auto out_grad_tmp = backend::expand<T>(out_grad_, x_shape);
      auto out_tmp = backend::expand<T>(out_, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  } else {
    // Static-shape path: same algorithm with shapes known at trace time.
    auto zero_tensor =
        full<T>(common::vectorize(x.dims()), 0.0, x.dtype(), x.place());
    std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
    int64_t axis_size = axis.size();
    int64_t x_dim_size = x_dim.size();
    // NOTE(review): same `reduce_all` overwrite as the dynamic branch above.
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
      auto out_tmp = out.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      auto axis_ = std::vector<int64_t>();
      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      // Re-insert the reduced axes as size-1 dims, then broadcast to x.
      auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
      auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
      auto out_ = reshape<T>(out, out_grad_shape);
      auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
      auto out_tmp = out_.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  }
  set_output<T>(x_grad_tmp, x_grad);
}

template <typename T>
void p_norm_grad(const Tensor& x,
/*output of forward was reserved for efficient backward*/
Expand Down
8 changes: 0 additions & 8 deletions paddle/phi/core/platform/device/gpu/gpu_launch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,14 +176,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
return config;
}

template <typename Context>
void LimitGridDim(const Context& ctx, dim3* grid_dim) {
auto max_grid_dim =
reinterpret_cast<const phi::GPUContext&>(ctx).GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
grid_dim->z = grid_dim->z < max_grid_dim[2] ? grid_dim->z : max_grid_dim[2];
}
} // namespace platform
} // namespace paddle

Expand Down
Loading
Loading