Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b62a062
refine forrange (#72360)
wanghuancoder Apr 20, 2025
b40d76b
reduce support big tensor (#71970)
wanghuancoder Apr 23, 2025
df4697f
[PHI] Fix gridDim limit for reduce kernel (#72507)
lshpku Apr 27, 2025
f179a6a
[API] isclose support bigtensor (#72516)
wanghuancoder Apr 28, 2025
d8c1ad1
[API] isnan isinf isfinite support bigtensor (#72517)
wanghuancoder Apr 28, 2025
04d7eb9
[PHI] Fix cum kernel for big tensor (#72562)
lshpku May 6, 2025
752eb6e
[PHI] Preliminary fix for elementwise broadcast int32 shape overflow …
Enigmatisms May 7, 2025
98b4876
[PHI] Align linalg.solve kernel with torch (#72608)
lshpku May 9, 2025
4e331fd
Update strided copy kernel (#72662)
Dmovic May 13, 2025
831663e
[PHI] Fix grid sample kernel for big tensor (#72628)
lshpku May 14, 2025
cc3177f
[PHI] Fix argsort big tensor bug (#72712)
Enigmatisms May 15, 2025
1bf96d4
[PHI] Fix contiguous kernel for big tensor (#72705)
Dmovic May 15, 2025
7ae5373
[PHI] Fix flatten and split kernel for big tensor (#72634)
ggggxm May 19, 2025
7c1c28f
[PHI] Fix out-of-bound issue of paddle.take_along_axis (#72757)
xkkkkkk23 May 19, 2025
63ce668
[PHI] fix paddle.diag with big tensor (#72638)
ZhangX-21 May 19, 2025
8c64555
[API] fix paddle.cross with big tensor (#72652)
ZhangX-21 May 19, 2025
3a81f1c
[PHI] Fix paddle.where api for big tensor (#72717)
huangjiyi May 19, 2025
6ffb631
[PHI] Fix bincount kernel for big tensor (#72706)
ggggxm May 20, 2025
49b0b27
[PHI] Fix full_like kernel for big tensor (#72831)
lshpku May 21, 2025
4f98369
[API] Fix int overflow and float16 support for paddle.frac (#72815)
xkkkkkk23 May 21, 2025
578d5a9
[PHI] Align paddle.inner with torch in matmul logic (#72843)
lshpku May 22, 2025
07ba4c9
[PHI] Fix paddle.var & paddle.std float16 overflow (#72650)
Enigmatisms May 12, 2025
134cc48
[PHI] Fix logsumexp precision problem (#72681)
Enigmatisms May 16, 2025
84b2680
[Accuracy diff No.55-56、76-77] Fix accuracy diff for var&std API (#72…
ooooo-create May 23, 2025
c3ef9a5
[Accuracy diff No.21] Fix accuracy diff for heaviside API (#72894)
ooooo-create May 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions paddle/fluid/operators/elementwise/elementwise_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ class ElementwiseOp : public framework::OperatorWithKernel {
axis));
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
std::vector<int64_t> x_dims_array(max_dim);
std::vector<int64_t> y_dims_array(max_dim);
std::vector<int64_t> out_dims_array(max_dim);
#ifdef PADDLE_WITH_DNNL
// Broadcasting of dims has to be done on Paddle shapes (NHWC)
// if model is using NHWC and any of shapes in at least 3D
Expand All @@ -120,8 +120,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
if (should_rotate) {
// Pick bigger shape and rotate this one
bool x_over_y = (x_dims.size() > y_dims.size());
auto vdims = x_over_y ? common::vectorize<int>(x_dims)
: common::vectorize<int>(y_dims);
auto vdims = x_over_y ? common::vectorize<int64_t>(x_dims)
: common::vectorize<int64_t>(y_dims);
std::rotate(vdims.begin() + 1, vdims.begin() + 2, vdims.end());
if (x_over_y) {
x_dims = common::make_ddim(vdims);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1308,7 +1308,13 @@ void max_grad(const Tensor& x,
auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
auto out_tmp = out.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
} else {
auto axis_ = std::vector<int64_t>();
if (reduce_all) {
Expand All @@ -1329,7 +1335,13 @@ void max_grad(const Tensor& x,
auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
auto out_tmp = out_.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis_, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
}
set_output<T>(x_grad_tmp, x_grad);
}
Expand Down Expand Up @@ -1361,7 +1373,13 @@ void min_grad(const Tensor& x,
auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
auto out_tmp = out.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
} else {
auto axis_ = std::vector<int64_t>();
if (reduce_all) {
Expand All @@ -1382,7 +1400,13 @@ void min_grad(const Tensor& x,
auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
auto out_tmp = out_.expand(IntArray(x_dim));
auto mask = equal<T>(x, out_tmp);
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
if (axis_size == 0) {
auto mask_sum = sum<T>(mask, axis_, x.dtype(), keepdim = true);
auto grad_tmp = out_grad_tmp / mask_sum;
x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
} else {
x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
}
}
set_output<T>(x_grad_tmp, x_grad);
}
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h
Original file line number Diff line number Diff line change
Expand Up @@ -909,7 +909,7 @@ Tensor flatten_decomp(const Tensor& x, int start_axis, int end_axis) {
return reshape<T>(x, x_dim);
}

int slice_numel = 1;
int64_t slice_numel = 1;
for (int i = start_axis; i <= end_axis; ++i) {
slice_numel *= x_dim[i];
}
Expand Down
209 changes: 108 additions & 101 deletions paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h
Original file line number Diff line number Diff line change
Expand Up @@ -1575,107 +1575,6 @@ void pad_grad(const Tensor& input,
}
}

template <typename T>
// Backward (VJP) decomposition rule for paddle.max: the incoming gradient is
// routed to every input element that equals the reduced maximum; all other
// elements receive zero.  NOTE(review): when several elements tie for the
// maximum, each tied element receives the FULL upstream gradient (no split) —
// confirm this matches the intended max-grad semantics of the framework.
void max_grad(const Tensor& x,
              const Tensor& out,       // forward result of max(x, axis, keepdim)
              const Tensor& out_grad,  // gradient w.r.t. `out`
              const IntArray& axis,    // reduction axes given to the forward op
              bool keepdim,
              bool reduce_all,
              Tensor* x_grad) {        // output slot; may be null when unused
  // Nothing to compute when the caller did not request this gradient.
  if (!x_grad) {
    return;
  }

  Tensor x_grad_tmp;
  if (has_dynamic_shape(x.shape())) {
    // Dynamic-shape path: shapes are tensors computed at runtime, so all
    // shape-dependent ops go through the `backend::` variants.
    const Tensor x_shape = shape64<T>(x);
    const Tensor zero_tensor =
        backend::full_with_tensor<T>(x_shape, 0.0, x.dtype(), x.place());
    const int64_t axis_size = axis.size();
    const int64_t x_dim_size = x.dims().size();

    // NOTE(review): `reduce_all` is unconditionally overwritten to false here,
    // so the incoming argument is ignored and the condition below degenerates
    // to (axis_size == 0 || axis_size == x_dim_size) — verify this is intended.
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      // out/out_grad already broadcast cleanly against x (kept dims or rank
      // <= 1), so expand directly and mask by equality with the max.
      auto out_grad_tmp = backend::expand<T>(out_grad, x_shape);
      auto out_tmp = backend::expand<T>(out, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      // Reduced dims were squeezed out of `out`; re-insert size-1 dims at the
      // (normalized) reduction axes before expanding back to x's shape.
      const Tensor out_grad_shape = shape64<T>(out_grad);
      auto axis_ = std::vector<int64_t>();

      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      const Tensor out_grad_shape_extend =
          get_unsqueeze_dims<T>(out_grad_shape, axis_);
      auto out_grad_ = backend::reshape<T>(out_grad, out_grad_shape_extend);
      auto out_ = backend::reshape<T>(out, out_grad_shape_extend);
      auto out_grad_tmp = backend::expand<T>(out_grad_, x_shape);
      auto out_tmp = backend::expand<T>(out_, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  } else {
    // Static-shape path: same algorithm with shapes known at trace time.
    auto zero_tensor =
        full<T>(common::vectorize(x.dims()), 0.0, x.dtype(), x.place());
    std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
    int64_t axis_size = axis.size();
    int64_t x_dim_size = x_dim.size();
    // NOTE(review): same `reduce_all` overwrite as the dynamic branch above.
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
      auto out_tmp = out.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      auto axis_ = std::vector<int64_t>();
      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      // Re-insert the reduced axes as size-1 dims, then broadcast to x.
      auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
      auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
      auto out_ = reshape<T>(out, out_grad_shape);
      auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
      auto out_tmp = out_.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  }
  set_output<T>(x_grad_tmp, x_grad);
}

template <typename T>
void slice_grad(const Tensor& input,
const Tensor& out_grad,
Expand Down Expand Up @@ -3498,6 +3397,114 @@ void amin_grad(const Tensor& x,
}
}

template <typename T>
// Backward (VJP) decomposition rule for paddle.max: routes the upstream
// gradient to the input elements that equal the reduced maximum, zero
// elsewhere.  When `axis` is empty (full reduction) the computation is
// delegated to amax_grad — presumably so tied maxima share the gradient
// evenly rather than each receiving the full gradient; TODO confirm against
// amax_grad's tie-handling.
void max_grad(const Tensor& x,
              const Tensor& out,       // forward result of max(x, axis, keepdim)
              const Tensor& out_grad,  // gradient w.r.t. `out`
              const IntArray& axis,    // reduction axes given to the forward op
              bool keepdim,
              bool reduce_all,
              Tensor* x_grad) {        // output slot; may be null when unused
  // Nothing to compute when the caller did not request this gradient.
  if (!x_grad) {
    return;
  }

  // Empty axis list == reduce over all dims: defer to amax_grad and return.
  if (axis.size() == 0) {
    Tensor x_grad_tmp;
    amax_grad<T>(x, out, out_grad, axis, keepdim, reduce_all, &x_grad_tmp);
    set_output<T>(x_grad_tmp, x_grad);
    return;
  }

  Tensor x_grad_tmp;
  if (has_dynamic_shape(x.shape())) {
    // Dynamic-shape path: shapes are runtime tensors, so shape-dependent ops
    // go through the `backend::` variants.
    const Tensor x_shape = shape64<T>(x);
    const Tensor zero_tensor =
        backend::full_with_tensor<T>(x_shape, 0.0, x.dtype(), x.place());
    const int64_t axis_size = axis.size();
    const int64_t x_dim_size = x.dims().size();

    // NOTE(review): `reduce_all` is overwritten to false, so the incoming
    // argument is ignored; with the early return above, `axis_size == 0` is
    // also unreachable, leaving effectively (axis_size == x_dim_size).
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      // out/out_grad already broadcast cleanly against x (kept dims or rank
      // <= 1): expand directly and mask by equality with the max.
      auto out_grad_tmp = backend::expand<T>(out_grad, x_shape);
      auto out_tmp = backend::expand<T>(out, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      // Reduced dims were squeezed out of `out`; re-insert size-1 dims at the
      // (normalized) reduction axes before expanding back to x's shape.
      const Tensor out_grad_shape = shape64<T>(out_grad);
      auto axis_ = std::vector<int64_t>();

      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      const Tensor out_grad_shape_extend =
          get_unsqueeze_dims<T>(out_grad_shape, axis_);
      auto out_grad_ = backend::reshape<T>(out_grad, out_grad_shape_extend);
      auto out_ = backend::reshape<T>(out, out_grad_shape_extend);
      auto out_grad_tmp = backend::expand<T>(out_grad_, x_shape);
      auto out_tmp = backend::expand<T>(out_, x_shape);
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  } else {
    // Static-shape path: same algorithm with shapes known at trace time.
    auto zero_tensor =
        full<T>(common::vectorize(x.dims()), 0.0, x.dtype(), x.place());
    std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
    int64_t axis_size = axis.size();
    int64_t x_dim_size = x_dim.size();
    // NOTE(review): same `reduce_all` overwrite as the dynamic branch above.
    reduce_all = false;
    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
      reduce_all = true;
    } else {
      reduce_all = false;
    }

    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
      auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
      auto out_tmp = out.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    } else {
      auto axis_ = std::vector<int64_t>();
      if (reduce_all) {
        for (int64_t i = 0; i < x_dim_size; i++) {
          axis_.push_back(i);
        }
      } else {
        // Normalize negative axes into [0, x_dim_size).
        axis_ = axis.GetData();
        for (int64_t i = 0; i < axis_size; i++) {
          if (axis[i] < 0) {
            axis_[i] = axis[i] + x_dim_size;
          }
        }
      }
      // Re-insert the reduced axes as size-1 dims, then broadcast to x.
      auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
      auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
      auto out_ = reshape<T>(out, out_grad_shape);
      auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
      auto out_tmp = out_.expand(IntArray(x_dim));
      auto mask = equal<T>(x, out_tmp);
      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
    }
  }
  set_output<T>(x_grad_tmp, x_grad);
}

template <typename T>
void p_norm_grad(const Tensor& x,
/*output of forward was reserved for efficient backward*/
Expand Down
8 changes: 0 additions & 8 deletions paddle/phi/core/platform/device/gpu/gpu_launch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,14 +176,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
return config;
}

template <typename Context>
void LimitGridDim(const Context& ctx, dim3* grid_dim) {
auto max_grid_dim =
reinterpret_cast<const phi::GPUContext&>(ctx).GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
grid_dim->z = grid_dim->z < max_grid_dim[2] ? grid_dim->z : max_grid_dim[2];
}
} // namespace platform
} // namespace paddle

Expand Down
Loading
Loading