
[NPU] fix some op bugs (#31855)
* fix some op bugs

* fix some bugs

* follow comments

* fix log level

* add ut
zhiqiu authored Mar 25, 2021
1 parent 9754d0a commit f354e1d
Showing 8 changed files with 98 additions and 32 deletions.
3 changes: 2 additions & 1 deletion paddle/fluid/operators/activation_op_npu.cc
@@ -364,4 +364,5 @@ REGISTER_OP_NPU_KERNEL(
REGISTER_OP_NPU_KERNEL(
square, ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::SquareNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
paddle::platform::float16>,
ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, int>);
6 changes: 3 additions & 3 deletions paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -148,13 +148,13 @@ class LazyZerosNPU {
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
if (found_inf_vec[0]) {
VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
VLOG(4) << "-- UpdateLossScaling: Find infinite grads. --";

auto place = dev_ctx.GetPlace();
auto stream = dev_ctx.stream();
auto g = out->mutable_data<int>(place);
auto g = out->mutable_data<T>(place);
platform::NPUMemsetAsync(static_cast<void*>(g), 0,
out->numel() * sizeof(int), stream);
out->numel() * sizeof(T), stream);
}
}
}
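Note: the LazyZerosNPU fix above replaces sizeof(int) with sizeof(T), so the async memset covers exactly numel() * sizeof(T) bytes of the gradient buffer. A minimal standalone sketch of why the byte count must follow the element type (plain C++, a 2-byte stand-in type, not Paddle code):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Stand-in for a half-precision (2-byte) gradient buffer; not Paddle's float16.
  using half_t = uint16_t;
  std::vector<half_t> grad(8, half_t{0x3C00});  // nonzero bit pattern

  // Wrong: sizeof(int) is typically 4, so 8 * 4 = 32 bytes would be written
  // into a buffer that only owns 8 * 2 = 16 bytes (out of bounds).
  // std::memset(grad.data(), 0, grad.size() * sizeof(int));

  // Right: size the fill by the element type actually stored, as the kernel
  // now does with numel() * sizeof(T).
  std::memset(grad.data(), 0, grad.size() * sizeof(half_t));

  for (auto v : grad) std::cout << v << ' ';  // prints eight zeros
  std::cout << '\n';
  return 0;
}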
46 changes: 25 additions & 21 deletions paddle/fluid/operators/concat_op_npu.cc
@@ -71,15 +71,6 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
auto outs =
ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));

{
auto dx = outs;
auto x = ins;
for (size_t i = 0; i < dx.size(); ++i) {
if (dx[i] != nullptr) {
dx[i]->set_lod(x[i]->lod());
}
}
}
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
@@ -88,26 +79,39 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {

axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
// get output tensor that the name is not kEmptyVarName
std::vector<framework::Tensor> outputs;

std::vector<int> sizes;
int offset = 0;
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
for (size_t j = 0; j < outs.size(); ++j) {
// For stop gradient
// get output tensor that the name is not kEmptyVarName
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs.push_back(*outs[j]);
sizes.push_back(outs[j]->dims()[axis]);
std::vector<int> offsets;
std::vector<int> sizes;
for (int dim = 0; dim < ins[j]->dims().size(); ++dim) {
if (dim == axis) {
offsets.push_back(offset);
sizes.push_back(ins[j]->dims()[dim]);
} else {
offsets.push_back(0);
sizes.push_back(ins[j]->dims()[dim]);
}
}
auto runner =
NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
{{"offsets", offset}, {"size", ins[j]->dims()[axis]}});
runner.Run(stream);
}
if (ins[j]->numel() != 0UL) {
offset += ins[j]->dims()[axis];
}
}
auto runner =
NpuOpRunner("SplitVD", {*out_grad}, outputs,
{{"split_dim", axis},
{"size_splits", sizes},
{"num_split", static_cast<int>(outputs.size())}});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};

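Note: the rewritten ConcatGrad kernel slices the output gradient once per input with SliceD, tracking a running offset along the concat axis, instead of issuing a single SplitVD. A standalone sketch of how the per-input offsets and sizes vectors are built (plain C++ over vector shapes, hypothetical values, not the Paddle kernel):

#include <iostream>
#include <vector>

int main() {
  // Shapes of the original concat inputs, concatenated along `axis`.
  std::vector<std::vector<int>> in_dims = {{2, 3, 5}, {2, 4, 5}, {2, 2, 5}};
  int axis = 1;

  int offset = 0;
  for (size_t j = 0; j < in_dims.size(); ++j) {
    std::vector<int> offsets, sizes;
    for (int dim = 0; dim < static_cast<int>(in_dims[j].size()); ++dim) {
      // Only the concat axis starts at the running offset; every other
      // dimension is taken whole, starting at 0.
      offsets.push_back(dim == axis ? offset : 0);
      sizes.push_back(in_dims[j][dim]);
    }
    std::cout << "slice " << j << ": offsets=[";
    for (int v : offsets) std::cout << v << ' ';
    std::cout << "] sizes=[";
    for (int v : sizes) std::cout << v << ' ';
    std::cout << "]\n";

    offset += in_dims[j][axis];  // advance along the concat axis
  }
  return 0;
}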
39 changes: 37 additions & 2 deletions paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
@@ -34,23 +34,58 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {

out->mutable_data<T>(ctx.GetPlace());

// special case
if (x->dims().size() == 1 && keep_dims == false) {
keep_dims = true;
}

auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();

framework::Tensor cast_x;
framework::Tensor cast_out;
// NOTE: ReduceSumD only supports fp32 and fp16
if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) {
cast_x.Resize(x->dims());
cast_x.mutable_data<float>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
auto runner_cast = NpuOpRunner(
"Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);

cast_out.Resize(out->dims());
cast_out.mutable_data<float>(ctx.GetPlace());
} else {
cast_x.ShareDataWith(*x);
cast_out.ShareDataWith(*out);
}

if (reduce_all) {
std::vector<int> dim_vec;
for (int i = 0; i < x->dims().size(); i++) {
dim_vec.push_back(i);
}
auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},

auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dim_vec}, {"keep_dims", keep_dims}});
runner.Run(stream);

} else {
auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},
auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dims}, {"keep_dims", keep_dims}});
runner.Run(stream);
}

if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) {
auto dst_dtype = ConvertToNpuDtype(out->type());
auto runner_cast =
NpuOpRunner("Cast", {cast_out}, {*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
}
}
};

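Note: since ReduceSumD only accepts fp32 and fp16 inputs, the kernel now casts other dtypes up to float, reduces, then casts the result back to the output dtype. A standalone sketch of that cast–reduce–cast pattern (plain C++, integers summed through a float buffer; illustrative only, not the NPU path):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int32_t> x = {1, 2, 3, 4, 5};  // dtype the reduce op cannot take

  // Cast up to a supported compute type.
  std::vector<float> cast_x(x.begin(), x.end());

  // Reduce in the supported type.
  float cast_out = std::accumulate(cast_x.begin(), cast_x.end(), 0.0f);

  // Cast the result back to the original output dtype.
  int32_t out = static_cast<int32_t>(cast_out);
  std::cout << out << '\n';  // 15
  return 0;
}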
10 changes: 9 additions & 1 deletion paddle/fluid/operators/slice_op_npu.cc
@@ -36,7 +36,15 @@ void UpdateAttr(const framework::DDim in_dims, const std::vector<int> axes,

if (axis == i) {
start = starts[cnt];
end = ends[cnt] <= in_dims[i] ? ends[cnt] : end;
if (start < 0) {
start = (start + in_dims[i]);
}
start = std::max(start, static_cast<int>(0));
end = ends[cnt];
if (end < 0) {
end = (end + in_dims[i]);
}
end = std::min(end, static_cast<int>(in_dims[i]));
cnt++;
}

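Note: UpdateAttr now normalizes negative starts/ends by adding the dimension size and clamps them into [0, dim], matching Python-style slicing (this is what the new TestSliceOp2 case with starts [1, 0, -3] exercises). A standalone sketch of that normalization (plain C++, hypothetical helper name, not the Paddle function):

#include <algorithm>
#include <iostream>

// Normalize a [start, end) pair for one dimension of size `dim`,
// allowing Python-style negative indices.
void NormalizeSlice(int dim, int& start, int& end) {
  if (start < 0) start += dim;
  start = std::max(start, 0);
  if (end < 0) end += dim;
  end = std::min(end, dim);
}

int main() {
  int start = -3, end = -1;
  NormalizeSlice(/*dim=*/5, start, end);
  std::cout << start << ' ' << end << '\n';  // 2 4 -> a slice of length 2
  return 0;
}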
6 changes: 3 additions & 3 deletions python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py
@@ -32,7 +32,7 @@ class TestConcat(OpTest):
def setUp(self):
self.set_npu()
self.op_type = "concat"
self.place = paddle.NPUPlace(4)
self.place = paddle.NPUPlace(0)
self.init_dtype()
self.init_test_data()

@@ -66,7 +66,7 @@ def init_test_data(self):

def test_check_grad(self):
self.check_grad_with_place(
self.place, ['x0'], 'Out', check_dygraph=False)
self.place, ['x0', 'x2'], 'Out', check_dygraph=False)
self.check_grad_with_place(
self.place, ['x1'], 'Out', check_dygraph=False)
self.check_grad_with_place(
@@ -77,7 +77,7 @@ class TestConcatFP16(OpTest):
def setUp(self):
self.set_npu()
self.op_type = "concat"
self.place = paddle.NPUPlace(4)
self.place = paddle.NPUPlace(0)
self.init_dtype()
self.init_test_data()

@@ -32,6 +32,7 @@ class TestReduceSum(OpTest):
def setUp(self):
np.random.seed(SEED)
self.set_npu()
self.init_dtype()
self.place = paddle.NPUPlace(0)
self.init_op_type()
self.initTestCase()
Expand All @@ -42,7 +43,7 @@ def setUp(self):
'keep_dim': self.keep_dim,
'reduce_all': self.reduce_all
}
self.inputs = {'X': np.random.random(self.shape).astype("float32")}
self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)}
if self.attrs['reduce_all']:
self.outputs = {'Out': self.inputs['X'].sum()}
else:
@@ -78,6 +79,11 @@ def test_check_output(self):
#


class TestReduceSum2(OpTest):
def init_dtype(self):
self.dtype = np.int32


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestReduceSumNet(unittest.TestCase):
12 changes: 12 additions & 0 deletions python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py
@@ -63,10 +63,22 @@ def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False)

def test_check_grad_normal(self):
if self.dtype == np.float16:
return
self.check_grad_with_place(
self.place, ['Input'], 'Out', check_dygraph=False)


class TestSliceOp2(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
self.starts = [1, 0, -3]
self.ends = [3, 3, -1]
self.axes = [0, 1, 2]
self.infer_flags = [1, 1, 1]
self.out = self.input[1:3, 0:3, -3:-1, :]


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestSliceOpFp16(TestSliceOp):

1 comment on commit f354e1d

@paddle-bot-old (bot) commented on f354e1d, Mar 25, 2021

🕵️ CI failures summary

🔍PR: #31855 Commit ID: f354e1d contains failed CI.
