PaddlePaddle · tianshuo78520a · Mar 29, 2022
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -280,58 +280,13 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
   }
 };
 
-template <typename T>
-struct SelectedRowsAddToTensor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext& context,
-                  const phi::SelectedRows& input1, framework::Tensor* input2) {
-    if (UNLIKELY(input1.rows().size() == 0)) {
-      LOG(WARNING) << "input selected rows is empty!";
-      return;
-    }
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height, in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But recieved first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height, in2_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel, input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But recieved first input width = [%d], second input width = [%d]",
-            in1_row_numel, input2->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* input2_data = input2->data<T>();
-
-    for (size_t i = 0; i < in1_rows.size(); i++) {
-      for (int64_t j = 0; j < in1_row_numel; j++) {
-        input2_data[in1_rows[i] * in1_row_numel + j] +=
-            in1_data[i * in1_row_numel + j];
-      }
-    }
-  }
-};
-
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
                                         platform::bfloat16>;
 
-template struct SelectedRowsAddToTensor<phi::CPUContext, float>;
-template struct SelectedRowsAddToTensor<phi::CPUContext, double>;
-template struct SelectedRowsAddToTensor<phi::CPUContext, int>;
-template struct SelectedRowsAddToTensor<phi::CPUContext, int64_t>;
-template struct SelectedRowsAddToTensor<phi::CPUContext, platform::bfloat16>;
 // This is a separated namespace for manipulate SelectedRows typed
 // data. Like merge duplicated rows, adding two SelectedRows etc.
 //

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -174,77 +174,12 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
   }
 };
 
-template <typename T>
-struct SelectedRowsAddTensor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext& context,
-                  const phi::SelectedRows& input1,
-                  const framework::Tensor& input2, framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height, in2_dims[0],
-        platform::errors::InvalidArgument(
-            "The two inputs height must be equal."
-            "But recieved first input height = [%d], first input height = [%d]",
-            in1_height, in2_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        in1_height, out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input and output height must be equal."
-            "But recieved input height = [%d], output height = [%d]",
-            in1_height, out_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel, input2.numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But recieved first input width = [%d], second input width = [%d]",
-            in1_row_numel, input2.numel() / in1_height));
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel, output->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The input and output width must be equal."
-            "But recieved input width = [%d], output width = [%d]",
-            in1_row_numel, output->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-
-    phi::funcs::SetConstant<phi::GPUContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
-        in1_row_numel);
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
                                       platform::float16>;
 
-template struct SelectedRowsAddTensor<phi::GPUContext, float>;
-template struct SelectedRowsAddTensor<phi::GPUContext, double>;
-template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
-template struct SelectedRowsAddTensor<phi::GPUContext, platform::float16>;
-
 template <typename T>
 struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
@@ -350,54 +285,12 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
   }
 };
 
-template <typename T>
-struct SelectedRowsAddToTensor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext& context,
-                  const phi::SelectedRows& input1, framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height, in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But recieved first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height, in2_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel, input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But recieved first input width = [%d], second input width = [%d]",
-            in1_row_numel, input2->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
-        in1_row_numel);
-  }
-};
-
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
                                         platform::float16>;
-template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
-template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
-template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
-template struct SelectedRowsAddToTensor<phi::GPUContext, int64_t>;
-template struct SelectedRowsAddToTensor<phi::GPUContext, platform::float16>;
 
 namespace scatter {
 

diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/operators/meshgrid_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
 namespace paddle {
 namespace operators {
 
@@ -146,3 +145,29 @@ REGISTER_OPERATOR(meshgrid, ops::MeshgridOp, ops::MeshgridOpMaker,
                   ops::MeshgridGradOpMaker<paddle::framework::OpDesc>,
                   ops::MeshgridGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(meshgrid_grad, ops::MeshgridGradOp);
+REGISTER_OP_CPU_KERNEL(
+    meshgrid, ops::MeshgridKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeshgridKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MeshgridKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MeshgridKernel<paddle::platform::CPUDeviceContext, int64_t>);
+
+REGISTER_OP_CPU_KERNEL(
+    meshgrid_grad,
+    ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, double>);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+REGISTER_OP_CUDA_KERNEL(
+    meshgrid, ops::MeshgridKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeshgridKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeshgridKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MeshgridKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::MeshgridKernel<paddle::platform::CUDADeviceContext, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    meshgrid_grad,
+    ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
+#endif