Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPU] fix some op bugs #31855

Merged
merged 5 commits into from
Mar 25, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion paddle/fluid/operators/activation_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -364,4 +364,5 @@ REGISTER_OP_NPU_KERNEL(
REGISTER_OP_NPU_KERNEL(
square, ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::SquareNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
paddle::platform::float16>,
ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, int>);
2 changes: 1 addition & 1 deletion paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class LazyZerosNPU {

auto place = dev_ctx.GetPlace();
auto stream = dev_ctx.stream();
auto g = out->mutable_data<int>(place);
auto g = out->mutable_data<T>(place);
platform::NPUMemsetAsync(static_cast<void*>(g), 0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sizeof(int) -> sizeof(T)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

out->numel() * sizeof(int), stream);
}
Expand Down
46 changes: 25 additions & 21 deletions paddle/fluid/operators/concat_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,6 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
auto outs =
ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));

{
auto dx = outs;
auto x = ins;
for (size_t i = 0; i < dx.size(); ++i) {
if (dx[i] != nullptr) {
dx[i]->set_lod(x[i]->lod());
}
}
}
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
Expand All @@ -88,26 +79,39 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {

axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
// get output tensor that the name is not kEmptyVarName
std::vector<framework::Tensor> outputs;

std::vector<int> sizes;
int offset = 0;
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we omit the explicit stream and use the default stream instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, the new default-stream feature has not been tested enough, so I will keep the explicit stream here; all explicit streams can be removed in the future.

for (size_t j = 0; j < outs.size(); ++j) {
// For stop gradient
// get output tensor that the name is not kEmptyVarName
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs.push_back(*outs[j]);
sizes.push_back(outs[j]->dims()[axis]);
std::vector<int> offsets;
std::vector<int> sizes;
for (int dim = 0; dim < ins[j]->dims().size(); ++dim) {
if (dim == axis) {
offsets.push_back(offset);
sizes.push_back(ins[j]->dims()[dim]);
} else {
offsets.push_back(0);
sizes.push_back(ins[j]->dims()[dim]);
}
}
auto runner =
NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
{{"offsets", offset}, {"size", ins[j]->dims()[axis]}});
runner.Run(stream);
}
if (ins[j]->numel() != 0UL) {
offset += ins[j]->dims()[axis];
}
}
auto runner =
NpuOpRunner("SplitVD", {*out_grad}, outputs,
{{"split_dim", axis},
{"size_splits", sizes},
{"num_split", static_cast<int>(outputs.size())}});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};

Expand Down
39 changes: 37 additions & 2 deletions paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,58 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {

out->mutable_data<T>(ctx.GetPlace());

// special case
if (x->dims().size() == 1 && keep_dims == false) {
keep_dims = true;
}

auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();

framework::Tensor cast_x;
framework::Tensor cast_out;
// NOTE: ReduceSumD only supports fp32 and fp16
if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) {
cast_x.Resize(x->dims());
cast_x.mutable_data<float>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
auto runner_cast = NpuOpRunner(
"Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);

cast_out.Resize(out->dims());
cast_out.mutable_data<float>(ctx.GetPlace());
} else {
cast_x.ShareDataWith(*x);
cast_out.ShareDataWith(*out);
}

if (reduce_all) {
std::vector<int> dim_vec;
for (int i = 0; i < x->dims().size(); i++) {
dim_vec.push_back(i);
}
auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},

auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dim_vec}, {"keep_dims", keep_dims}});
runner.Run(stream);

} else {
auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},
auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dims}, {"keep_dims", keep_dims}});
runner.Run(stream);
}

if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) {
auto dst_dtype = ConvertToNpuDtype(out->type());
auto runner_cast =
NpuOpRunner("Cast", {cast_out}, {*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
}
}
};

Expand Down
10 changes: 9 additions & 1 deletion paddle/fluid/operators/slice_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,15 @@ void UpdateAttr(const framework::DDim in_dims, const std::vector<int> axes,

if (axis == i) {
start = starts[cnt];
end = ends[cnt] <= in_dims[i] ? ends[cnt] : end;
if (start < 0) {
start = (start + in_dims[i]);
}
start = std::max(start, static_cast<int>(0));
end = ends[cnt];
if (end < 0) {
end = (end + in_dims[i]);
}
end = std::min(end, static_cast<int>(in_dims[i]));
cnt++;
}

Expand Down