Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPU] fix some op bugs #31855

Merged
merged 5 commits into from
Mar 25, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion paddle/fluid/operators/activation_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -364,4 +364,5 @@ REGISTER_OP_NPU_KERNEL(
REGISTER_OP_NPU_KERNEL(
square, ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::SquareNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
paddle::platform::float16>,
ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, int>);
2 changes: 1 addition & 1 deletion paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class LazyZerosNPU {

auto place = dev_ctx.GetPlace();
auto stream = dev_ctx.stream();
auto g = out->mutable_data<int>(place);
auto g = out->mutable_data<T>(place);
platform::NPUMemsetAsync(static_cast<void*>(g), 0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sizeof(int) -> sizeof(T)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

out->numel() * sizeof(int), stream);
}
Expand Down
46 changes: 25 additions & 21 deletions paddle/fluid/operators/concat_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,6 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
auto outs =
ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));

{
auto dx = outs;
auto x = ins;
for (size_t i = 0; i < dx.size(); ++i) {
if (dx[i] != nullptr) {
dx[i]->set_lod(x[i]->lod());
}
}
}
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
Expand All @@ -88,26 +79,39 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {

axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
// get output tensor that the name is not kEmptyVarName
std::vector<framework::Tensor> outputs;

std::vector<int> sizes;
int offset = 0;
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we omit the explicit stream and use the default stream instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, the new default-stream feature has not been tested enough, so I will keep the explicit stream here; all explicit streams can be removed in the future.

for (size_t j = 0; j < outs.size(); ++j) {
// For stop gradient
// get output tensor that the name is not kEmptyVarName
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs.push_back(*outs[j]);
sizes.push_back(outs[j]->dims()[axis]);
std::vector<int> offsets;
std::vector<int> sizes;
for (int dim = 0; dim < ins[j]->dims().size(); ++dim) {
if (dim == axis) {
offsets.push_back(offset);
sizes.push_back(ins[j]->dims()[dim]);
} else {
offsets.push_back(0);
sizes.push_back(ins[j]->dims()[dim]);
}
}
auto runner =
NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
{{"offsets", offset}, {"size", ins[j]->dims()[axis]}});
runner.Run(stream);
}
if (ins[j]->numel() != 0UL) {
offset += ins[j]->dims()[axis];
}
}
auto runner =
NpuOpRunner("SplitVD", {*out_grad}, outputs,
{{"split_dim", axis},
{"size_splits", sizes},
{"num_split", static_cast<int>(outputs.size())}});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};

Expand Down
39 changes: 37 additions & 2 deletions paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,58 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {

out->mutable_data<T>(ctx.GetPlace());

// special case
if (x->dims().size() == 1 && keep_dims == false) {
keep_dims = true;
}

auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();

framework::Tensor cast_x;
framework::Tensor cast_out;
// NOTE: ReduceSumD only supports fp32 and fp16
if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) {
cast_x.Resize(x->dims());
cast_x.mutable_data<float>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
auto runner_cast = NpuOpRunner(
"Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);

cast_out.Resize(out->dims());
cast_out.mutable_data<float>(ctx.GetPlace());
} else {
cast_x.ShareDataWith(*x);
cast_out.ShareDataWith(*out);
}

if (reduce_all) {
std::vector<int> dim_vec;
for (int i = 0; i < x->dims().size(); i++) {
dim_vec.push_back(i);
}
auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},

auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dim_vec}, {"keep_dims", keep_dims}});
runner.Run(stream);

} else {
auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},
auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dims}, {"keep_dims", keep_dims}});
runner.Run(stream);
}

if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) {
auto dst_dtype = ConvertToNpuDtype(out->type());
auto runner_cast =
NpuOpRunner("Cast", {cast_out}, {*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
}
}
};

Expand Down
10 changes: 9 additions & 1 deletion paddle/fluid/operators/slice_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,15 @@ void UpdateAttr(const framework::DDim in_dims, const std::vector<int> axes,

if (axis == i) {
start = starts[cnt];
end = ends[cnt] <= in_dims[i] ? ends[cnt] : end;
if (start < 0) {
start = (start + in_dims[i]);
}
start = std::max(start, static_cast<int>(0));
end = ends[cnt];
if (end < 0) {
end = (end + in_dims[i]);
}
end = std::min(end, static_cast<int>(in_dims[i]));
cnt++;
}

Expand Down