From 896b4504de31dec5410e42e6ff34fee0813a3f5e Mon Sep 17 00:00:00 2001
From: xiayanming <xiayanming@baidu.com>
Date: Sat, 6 Mar 2021 23:15:55 +0800
Subject: [PATCH 1/6] add gather npu op

---
 paddle/fluid/operators/CMakeLists.txt         |   5 +
 paddle/fluid/operators/gather_op_npu.cc       | 118 ++++++++++++
 paddle/fluid/operators/gather_op_npu_test.cc  | 172 ++++++++++++++++++
 .../tests/unittests/test_gather_op_npu.py     | 109 +++++++++++
 4 files changed, 404 insertions(+)
 create mode 100644 paddle/fluid/operators/gather_op_npu.cc
 create mode 100644 paddle/fluid/operators/gather_op_npu_test.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_gather_op_npu.py
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 17234edb116e3..a3964b28eab31 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -151,6 +151,11 @@ else()
     cc_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc DEPS tensor device_context eigen3)
 endif()
 
+# C++ unit test for the gather / gather_grad NPU kernels; Ascend CL builds only
+if (WITH_ASCEND_CL)
+    cc_test(gather_op_npu_test SRCS gather_op_npu_test.cc DEPS gather_op tensor op_registry scope device_context enforce executor)
+endif()
+
 cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS})
 if (WITH_PYTHON)
   cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind)
diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
new file mode 100644
index 0000000000000..cb80c61796d38
--- /dev/null
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -0,0 +1,118 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/operators/gather_op.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace paddle {
+namespace operators {
+
+inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) {
+  const framework::DDim& shape = src.dims();
+  int rank = shape.size();
+  framework::Tensor res;
+  res.ShareDataWith(src);
+  PADDLE_ENFORCE_LE(
+      rank, ndims,
+      platform::errors::InvalidArgument(
+          "The input Tensor's rank should be less than or equal to ndims"
+          "Received input Tensor's rank = %d, ndims = %d",
+          rank, ndims));
+  if (rank < ndims) {
+    std::vector<int64_t> new_dim(ndims, 1);
+    for (int i = ndims - rank; i < ndims; i++) {
+      new_dim[i] = shape[i - ndims + rank];
+    }
+    res.Resize(framework::make_ddim(new_dim));
+  }
+  return res;
+}
+
+template <typename DeviceContext, typename T>
+class GatherOpNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *out = ctx.Output<Tensor>("Out");
+    
+    out->mutable_data<T>(ctx.GetPlace());
+    auto runner = NpuOpRunner("Gather", {*x, *index}, {*out}, {{"validate_indices", true}});
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);  // run "Gather" on the NPU device context's stream
+  }
+};
+// NPU gather_grad: dX is zeros_like(X) with dOut scattered in at Index.
+template <typename DeviceContext, typename T>
+class GatherGradOpNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *index = ctx.Input<Tensor>("Index");
+    framework::Tensor tmp_index;  // [N, 1] view of 1-D Index; outlives the if
+    auto *x = ctx.Input<Tensor>("X");
+    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    
+    // step1: Unsqueeze index
+    const auto index_dims = index->dims();
+    if (index_dims.size() == 1) {
+      tmp_index.ShareDataWith(*index).Resize({index_dims[0], 1});
+      index = &tmp_index;
+    }
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+ 
+    // step2: ZerosLike x in device 
+    Tensor* tmp_zerox = const_cast<Tensor*>(x);
+    Tensor zeroslike_xout(x->type());
+    zeroslike_xout.Resize(x->dims());
+    zeroslike_xout.mutable_data<T>(ctx.GetPlace());
+
+    auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {zeroslike_xout}, {});
+    runner_zeroslike.Run(stream);
+    tmp_zerox = &zeroslike_xout;
+
+    // step3: scatter(x_grad)
+    dx->mutable_data<T>(ctx.GetPlace());
+    auto runner_scatter = NpuOpRunner("TensorScatterUpdate", 
+                                        {*tmp_zerox, *index, *dout}, 
+                                        {*dx}, {});
+    runner_scatter.Run(stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_NPU_KERNEL(
+    gather, 
+    ops::GatherOpNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::GatherOpNPUKernel<paddle::platform::NPUDeviceContext, 
+                           paddle::platform::float16>);
+                           
+REGISTER_OP_NPU_KERNEL(
+    gather_grad, 
+    ops::GatherGradOpNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::GatherGradOpNPUKernel<paddle::platform::NPUDeviceContext, 
+                               paddle::platform::float16>);
+#endif
diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc
new file mode 100644
index 0000000000000..025b04ac77b75
--- /dev/null
+++ b/paddle/fluid/operators/gather_op_npu_test.cc
@@ -0,0 +1,172 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/gather_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(gather);
+USE_OP_DEVICE_KERNEL(gather, NPU);
+USE_OP(gather_grad);
+USE_OP_DEVICE_KERNEL(gather_grad, NPU);
+
+template <typename T>
+void Compare(f::Scope* scope, const p::DeviceContext& ctx,
+             std::string op_type) {            
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  
+  auto index = scope->Var("Index");
+  auto tensor_index = index->GetMutable<f::LoDTensor>();
+  
+  std::vector<T> init_x;
+  for (int64_t i = 1; i < 7; ++i) {
+    // 1,2,3,4,5,6
+    init_x.push_back(static_cast<T>(i));
+  }
+
+  // X = [[1, 2], [3, 4], [5, 6]]
+  TensorFromVector(init_x, ctx, tensor_x);
+  tensor_x->Resize(paddle::framework::make_ddim({3, 2}));
+  // Index = [1, 2], i.e. gather rows 1 and 2 of X
+  std::vector<int> init_index = {1, 2};
+  paddle::framework::TensorFromVector<int>(init_index, ctx, tensor_index);
+  tensor_index->Resize(paddle::framework::make_ddim({2}));
+
+  ctx.Wait();
+
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+
+  // run
+  f::AttributeMap attrs = {{"validate_indices", true}};
+  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Index", {"Index"}}},
+                                    {{"Out", {"Out"}}}, attrs);
+
+  auto place = ctx.GetPlace();
+  op->Run(*scope, place);
+  // Copy Out into a host-side vector for the checks below.
+  std::vector<T> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+
+  ctx.Wait();
+ 
+  // ref:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/tensor/manipulation/gather_cn.html#gather
+  for(int i=0; i< static_cast<int>(out_vec.size()); ++i){
+    VLOG(3) << "out_vec[" << i<< "] : "<< out_vec[i];
+  }
+  uint32_t expected_size = 4;
+  EXPECT_EQ((uint32_t)out_vec.size(), expected_size);
+  // Expected: rows 1 and 2 of X, flattened -> {3, 4, 5, 6}
+  std::vector<T> expected_out_vec;
+  for (int64_t i = 3; i < 7; ++i) {
+    expected_out_vec.push_back(static_cast<T>(i));
+  }
+  ASSERT_EQ(out_vec.size(), expected_out_vec.size());  // guards indexing below
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], expected_out_vec[i]);
+  }
+}
+
+
+template <typename T>
+void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
+                 std::string op_type) {
+  // Inputs: Index, X and the incoming gradient (bound to Out@GRAD below).
+  auto index = scope->Var("Index");
+  auto tensor_index = index->GetMutable<f::LoDTensor>();
+
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  
+  auto dout = scope->Var("DOut");
+  auto tensor_dout = dout->GetMutable<f::LoDTensor>();
+
+  //https://tensorflow.google.cn/api_docs/python/tf/raw_ops/TensorScatterUpdate
+  //https://tensorflow.google.cn/api_docs/python/tf/tensor_scatter_nd_update
+  std::vector<int> init_index = {0, 1, 2, 0};
+  paddle::framework::TensorFromVector<int>(init_index, ctx, tensor_index);
+  tensor_index->Resize(paddle::framework::make_ddim({2, 2}));
+  // NOTE(review): rank-2 Index; the kernel's rank-1 path is not exercised.
+  std::vector<T> init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  TensorFromVector(init_x, ctx, tensor_x);
+  tensor_x->Resize(paddle::framework::make_ddim({3, 2}));
+  // One update value per index row: 5 -> (0, 1), 10 -> (2, 0).
+  std::vector<T> init_dout = {5.0, 10.0};
+  TensorFromVector(init_dout, ctx, tensor_dout);
+  tensor_dout->Resize(paddle::framework::make_ddim({2}));
+
+  ctx.Wait();
+
+  auto dx = scope->Var("DX");
+  auto tensor_dx = dx->GetMutable<f::LoDTensor>();
+
+  // run
+  f::AttributeMap attrs;
+  auto op = f::OpRegistry::CreateOp(op_type,
+    {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}},
+    {{"X@GRAD", {"DX"}}}, attrs);
+
+  auto place = ctx.GetPlace();
+  op->Run(*scope, place);
+  // Copy dX into a host-side vector for the checks below.
+  std::vector<T> dx_vec;
+  TensorToVector(*tensor_dx, ctx, &dx_vec);
+
+  ctx.Wait();
+  // ASSERT (not EXPECT): a wrong size must stop before the indexed loop below.
+  uint32_t expected_size = 3 * 2;
+  ASSERT_EQ((uint32_t)dx_vec.size(), expected_size);
+  // dX has X's shape {3, 2}; only (0, 1) and (2, 0) are non-zero.
+  std::vector<T> expected_dx_vec = {0.0, 5.0, 0.0, 0.0, 10.0, 0.0};
+  for (uint32_t i = 0; i < dx_vec.size(); i++) {
+    VLOG(3) << "dx_vec[i]=" << dx_vec[i];
+    EXPECT_EQ(dx_vec[i], expected_dx_vec[i]);
+  }
+}
+
+TEST(gather, NPU_fp32) {
+    f::Scope scope;
+    p::NPUDeviceContext ctx(p::NPUPlace(0));
+    Compare<float>(&scope, ctx, "gather");
+}
+
+TEST(gather, NPU_fp16) {
+    f::Scope scope;
+    p::NPUDeviceContext ctx(p::NPUPlace(0));
+    Compare<p::float16>(&scope, ctx, "gather");
+}
+
+TEST(gather_grad, NPU) {
+    f::Scope scope;
+    p::NPUDeviceContext ctx(p::NPUPlace(0));
+    CompareGrad<float>(&scope, ctx, "gather_grad");
+}
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
new file mode 100644
index 0000000000000..b026861c7e9fa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
@@ -0,0 +1,109 @@
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest, _set_use_system_allocator
+import paddle
+import paddle.fluid as fluid
+
+
+paddle.enable_static()
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestGatherOp(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "gather"
+        self.place = paddle.NPUPlace(0)
+        self.init_dtype()
+        self.init_input_output()
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Index': OpTest.np_dtype_to_fluid_dtype(self.index)
+        }
+        self.attrs = {'validate_indices': True}
+        self.outputs = {'Out': self.out}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_input_output(self):
+        self.x = np.array([[1, 2], [3, 4], [5, 6]]).astype(self.dtype)
+        # np.int is gone since NumPy 1.24; pin the index dtype explicitly.
+        self.index = np.array([1, 2]).astype(np.int64)
+        self.out = np.array([[3, 4], [5, 6]]).astype(self.dtype)  # rows 1, 2
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestGatherAPI(unittest.TestCase):
+    def test_name(self):
+        """A user-supplied `name` must show up in the output variable's name."""
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.static.data(name="x", shape=[3, 2], dtype="float32")
+            index = paddle.static.data(name='index', shape=[1], dtype='int32')
+
+            out = paddle.gather(x, index, name='gather')
+            self.assertEqual(('gather' in out.name), True)
+
+    def test_static(self):
+        """Run gather on NPUPlace(0) and check the fetched x, index and result."""
+        with paddle.static.program_guard(paddle.static.Program()):
+            # Host-side feed data.
+            x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype('float32')
+            index_np = np.array([1, 2]).astype('int32')
+            # Graph inputs with the same shapes and dtypes as the feed data.
+            x = paddle.static.data(name="x", shape=[3, 2], dtype='float32')
+            index = paddle.static.data(name="index", shape=[2], dtype='int32')
+            z = paddle.gather(x, index)
+            place = paddle.NPUPlace(0)
+            exe = paddle.static.Executor(place)
+            x_value, index_value, z_value = exe.run(
+                feed={"x": x_np,
+                      "index": index_np}, fetch_list=[x, index, z])
+            # Expected result: rows 1 and 2 of x_np.
+            z_expected = np.array([[3, 4], [5, 6]])
+            self.assertEqual(
+                (x_value == x_np).all(),
+                True,
+                msg="x_value = {}, but expected {}".format(x_value, x_np))
+            self.assertEqual(
+                (index_value == index_np).all(),
+                True,
+                msg="index_value = {}, but expected {}".format(index_value,
+                                                               index_np))
+            self.assertEqual(
+                (z_value == z_expected).all(),
+                True,
+                msg="z_value = {}, but expected {}".format(z_value, z_expected))
+
+    def test_backward(self):
+        # TODO(ascendrc): add a backward test; gather_grad now has an NPU kernel.
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file

From eea89c124c9b47f646129ceffe62a76e82d0b28b Mon Sep 17 00:00:00 2001
From: xiayanming <xiayanming@baidu.com>
Date: Mon, 8 Mar 2021 15:14:15 +0800
Subject: [PATCH 2/6] code review done

---
 paddle/fluid/operators/gather_op_npu.cc                   | 5 ++---
 python/paddle/fluid/tests/unittests/test_gather_op_npu.py | 3 ++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index cb80c61796d38..796617efc1833 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
 #include <memory>
 #include <string>
 #include <vector>
@@ -115,4 +114,4 @@ REGISTER_OP_NPU_KERNEL(
     ops::GatherGradOpNPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::GatherGradOpNPUKernel<paddle::platform::NPUDeviceContext, 
                                paddle::platform::float16>);
-#endif
+
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
index b026861c7e9fa..9f9e260b8cff9 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
@@ -106,4 +106,5 @@ def test_backward(self):
 
 
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
+    
\ No newline at end of file

From 203e842fe82329ed943e7516beccdfa112cc97f2 Mon Sep 17 00:00:00 2001
From: xiayanming <xiayanming@baidu.com>
Date: Mon, 8 Mar 2021 15:19:48 +0800
Subject: [PATCH 3/6] update python new line

---
 python/paddle/fluid/tests/unittests/test_gather_op_npu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
index 9f9e260b8cff9..87f0cd2359995 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
@@ -107,4 +107,3 @@ def test_backward(self):
 
 if __name__ == '__main__':
     unittest.main()
-    
\ No newline at end of file

From f6d5f8545276704d1254267e64650970d8e48ab6 Mon Sep 17 00:00:00 2001
From: xiayanming <xiayanming@baidu.com>
Date: Tue, 9 Mar 2021 15:29:45 +0800
Subject: [PATCH 4/6] precommit

---
 paddle/fluid/operators/gather_op_npu.cc      | 41 ++++++++--------
 paddle/fluid/operators/gather_op_npu_test.cc | 49 ++++++++++----------
 2 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index 796617efc1833..0ef54aca08528 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/gather_op.h"
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/operators/npu_op_runner.h"
-#include "paddle/fluid/operators/gather_op.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 
-inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) {
-  const framework::DDim& shape = src.dims();
+inline framework::Tensor UnsqueezeTo(const framework::Tensor &src, int ndims) {
+  const framework::DDim &shape = src.dims();
   int rank = shape.size();
   framework::Tensor res;
   res.ShareDataWith(src);
@@ -50,9 +50,10 @@ class GatherOpNPUKernel : public framework::OpKernel<T> {
     auto *x = ctx.Input<Tensor>("X");
     auto *index = ctx.Input<Tensor>("Index");
     auto *out = ctx.Output<Tensor>("Out");
-    
+
     out->mutable_data<T>(ctx.GetPlace());
-    auto runner = NpuOpRunner("Gather", {*x, *index}, {*out}, {{"validate_indices", true}});
+    auto runner = NpuOpRunner("Gather", {*x, *index}, {*out},
+                              {{"validate_indices", true}});
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
@@ -68,7 +69,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
     auto *x = ctx.Input<Tensor>("X");
     auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    
+
     // step1: Unsqueeze index
     const auto index_dims = index->dims();
     if (index_dims.size() == 1) {
@@ -79,22 +80,22 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
- 
-    // step2: ZerosLike x in device 
-    Tensor* tmp_zerox = const_cast<Tensor*>(x);
+
+    // step2: ZerosLike x in device
+    Tensor *tmp_zerox = const_cast<Tensor *>(x);
     Tensor zeroslike_xout(x->type());
     zeroslike_xout.Resize(x->dims());
     zeroslike_xout.mutable_data<T>(ctx.GetPlace());
 
-    auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {zeroslike_xout}, {});
+    auto runner_zeroslike =
+        NpuOpRunner("ZerosLike", {*x}, {zeroslike_xout}, {});
     runner_zeroslike.Run(stream);
     tmp_zerox = &zeroslike_xout;
 
     // step3: scatter(x_grad)
     dx->mutable_data<T>(ctx.GetPlace());
-    auto runner_scatter = NpuOpRunner("TensorScatterUpdate", 
-                                        {*tmp_zerox, *index, *dout}, 
-                                        {*dx}, {});
+    auto runner_scatter = NpuOpRunner("TensorScatterUpdate",
+                                      {*tmp_zerox, *index, *dout}, {*dx}, {});
     runner_scatter.Run(stream);
   }
 };
@@ -104,14 +105,12 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
-    gather, 
-    ops::GatherOpNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::GatherOpNPUKernel<paddle::platform::NPUDeviceContext, 
+    gather, ops::GatherOpNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::GatherOpNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>);
-                           
+
 REGISTER_OP_NPU_KERNEL(
-    gather_grad, 
+    gather_grad,
     ops::GatherGradOpNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::GatherGradOpNPUKernel<paddle::platform::NPUDeviceContext, 
+    ops::GatherGradOpNPUKernel<paddle::platform::NPUDeviceContext,
                                paddle::platform::float16>);
-
diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc
index 025b04ac77b75..9348e1c0b516c 100644
--- a/paddle/fluid/operators/gather_op_npu_test.cc
+++ b/paddle/fluid/operators/gather_op_npu_test.cc
@@ -39,14 +39,14 @@ USE_OP_DEVICE_KERNEL(gather_grad, NPU);
 
 template <typename T>
 void Compare(f::Scope* scope, const p::DeviceContext& ctx,
-             std::string op_type) {            
+             std::string op_type) {
   // init
   auto x = scope->Var("X");
   auto tensor_x = x->GetMutable<f::LoDTensor>();
-  
+
   auto index = scope->Var("Index");
   auto tensor_index = index->GetMutable<f::LoDTensor>();
-  
+
   std::vector<T> init_x;
   for (int64_t i = 1; i < 7; ++i) {
     // 1,2,3,4,5,6
@@ -68,8 +68,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs = {{"validate_indices", true}};
-  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Index", {"Index"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp(
+      op_type, {{"X", {"X"}}, {"Index", {"Index"}}}, {{"Out", {"Out"}}}, attrs);
 
   auto place = ctx.GetPlace();
   op->Run(*scope, place);
@@ -78,10 +78,10 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
   TensorToVector(*tensor_out, ctx, &out_vec);
 
   ctx.Wait();
- 
+
   // ref:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/tensor/manipulation/gather_cn.html#gather
-  for(int i=0; i< static_cast<int>(out_vec.size()); ++i){
-    VLOG(3) << "out_vec[" << i<< "] : "<< out_vec[i];
+  for (int i = 0; i < static_cast<int>(out_vec.size()); ++i) {
+    VLOG(3) << "out_vec[" << i << "] : " << out_vec[i];
   }
   uint32_t expected_size = 4;
   EXPECT_EQ((uint32_t)out_vec.size(), expected_size);
@@ -96,7 +96,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
   }
 }
 
-
 template <typename T>
 void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
                  std::string op_type) {
@@ -106,12 +105,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 
   auto x = scope->Var("X");
   auto tensor_x = x->GetMutable<f::LoDTensor>();
-  
+
   auto dout = scope->Var("DOut");
   auto tensor_dout = dout->GetMutable<f::LoDTensor>();
 
-  //https://tensorflow.google.cn/api_docs/python/tf/raw_ops/TensorScatterUpdate
-  //https://tensorflow.google.cn/api_docs/python/tf/tensor_scatter_nd_update
+  // https://tensorflow.google.cn/api_docs/python/tf/raw_ops/TensorScatterUpdate
+  // https://tensorflow.google.cn/api_docs/python/tf/tensor_scatter_nd_update
   std::vector<int> init_index = {0, 1, 2, 0};
   paddle::framework::TensorFromVector<int>(init_index, ctx, tensor_index);
   tensor_index->Resize(paddle::framework::make_ddim({2, 2}));
@@ -131,9 +130,9 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs;
-  auto op = f::OpRegistry::CreateOp(op_type,
-    {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}},
-    {{"X@GRAD", {"DX"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp(
+      op_type, {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}},
+      {{"X@GRAD", {"DX"}}}, attrs);
 
   auto place = ctx.GetPlace();
   op->Run(*scope, place);
@@ -154,19 +153,19 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 }
 
 TEST(gather, NPU_fp32) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    Compare<float>(&scope, ctx, "gather");
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<float>(&scope, ctx, "gather");
 }
 
 TEST(gather, NPU_fp16) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    Compare<p::float16>(&scope, ctx, "gather");
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<p::float16>(&scope, ctx, "gather");
 }
 
-TEST(gather_grad, NPU) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    CompareGrad<float>(&scope, ctx, "gather_grad");
+TEST(gather_grad, NPU_fp32) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  CompareGrad<float>(&scope, ctx, "gather_grad");
 }

From 6941eb783a20111066abcad418078eefe42f96f0 Mon Sep 17 00:00:00 2001
From: xiayanming <xiayanming@baidu.com>
Date: Tue, 9 Mar 2021 15:57:43 +0800
Subject: [PATCH 5/6] fix review

---
 paddle/fluid/operators/gather_op_npu.cc       | 22 +------------------
 .../unittests/{ => npu}/test_gather_op_npu.py |  2 +-
 2 files changed, 2 insertions(+), 22 deletions(-)
 rename python/paddle/fluid/tests/unittests/{ => npu}/test_gather_op_npu.py (100%)

diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index 0ef54aca08528..2d7b5b93ad651 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -17,32 +17,12 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/kron_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 
-inline framework::Tensor UnsqueezeTo(const framework::Tensor &src, int ndims) {
-  const framework::DDim &shape = src.dims();
-  int rank = shape.size();
-  framework::Tensor res;
-  res.ShareDataWith(src);
-  PADDLE_ENFORCE_LE(
-      rank, ndims,
-      platform::errors::InvalidArgument(
-          "The input Tensor's rank should be less than or equal to ndims"
-          "Received input Tensor's rank = %d, ndims = %d",
-          rank, ndims));
-  if (rank < ndims) {
-    std::vector<int64_t> new_dim(ndims, 1);
-    for (int i = ndims - rank; i < ndims; i++) {
-      new_dim[i] = shape[i - ndims + rank];
-    }
-    res.Resize(framework::make_ddim(new_dim));
-  }
-  return res;
-}
-
 template <typename DeviceContext, typename T>
 class GatherOpNPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_gather_op_npu.py
rename to python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
index 87f0cd2359995..0fcb2bee658fa 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
@@ -22,9 +22,9 @@
 import paddle
 import paddle.fluid as fluid
 
-
 paddle.enable_static()
 
+
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestGatherOp(OpTest):

From 98b75bef92ac306216097b3f9c8555bf4476af35 Mon Sep 17 00:00:00 2001
From: xiayanming <xiayanming@baidu.com>
Date: Tue, 9 Mar 2021 19:53:57 +0800
Subject: [PATCH 6/6] del commit

---
 paddle/fluid/operators/gather_op_npu_test.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc
index 9348e1c0b516c..4cd46da6f26f8 100644
--- a/paddle/fluid/operators/gather_op_npu_test.cc
+++ b/paddle/fluid/operators/gather_op_npu_test.cc
@@ -109,8 +109,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
   auto dout = scope->Var("DOut");
   auto tensor_dout = dout->GetMutable<f::LoDTensor>();
 
-  // https://tensorflow.google.cn/api_docs/python/tf/raw_ops/TensorScatterUpdate
-  // https://tensorflow.google.cn/api_docs/python/tf/tensor_scatter_nd_update
   std::vector<int> init_index = {0, 1, 2, 0};
   paddle::framework::TensorFromVector<int>(init_index, ctx, tensor_index);
   tensor_index->Resize(paddle::framework::make_ddim({2, 2}));