Skip to content

Commit ccd7a5b

Browse files
wbn03newway
authored and committed
[XPU] fixed the bug of tile op in large input and add XPU implementation. (PaddlePaddle#9102)
1 parent 8f09eb2 commit ccd7a5b

File tree

6 files changed

+123
-3
lines changed

6 files changed

+123
-3
lines changed

lite/kernels/host/tile_compute.cc

+4-3
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,10 @@ void TileCompute<T, PType>::Run() {
8585
int dst_stride = in_stride[i + 1] * right;
8686
for (int m = 0; m < num; m++) {
8787
for (int j = 0; j < bcast_dims[i]; j++) {
88-
std::memcpy(tmp_dst + j * dst_stride / bcast_dims[i] + m * dst_stride,
89-
tmp_src + m * dst_stride / bcast_dims[i],
90-
dst_stride / bcast_dims[i] * sizeof(T));
88+
std::memcpy(
89+
tmp_dst + j * (dst_stride / bcast_dims[i]) + m * dst_stride,
90+
tmp_src + m * (dst_stride / bcast_dims[i]),
91+
dst_stride / bcast_dims[i] * sizeof(T));
9192
}
9293
}
9394
tmp_src_tensor.CopyDataFrom(tmp_dst_tensor);

lite/kernels/xpu/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ add_kernel(gru_compute_xpu XPU basic SRCS gru_compute.cc)
3030
add_kernel(gru_unit_compute_xpu XPU basic SRCS gru_unit_compute.cc)
3131
add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc)
3232
add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc)
33+
add_kernel(tile_compute_xpu XPU basic SRCS tile_compute.cc)
3334
add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc)
3435
add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc)
3536
add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc)

lite/kernels/xpu/tile_compute.cc

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "lite/kernels/xpu/tile_compute.h"
16+
#include <vector>
17+
#include "lite/backends/xpu/xpu_header_sitter.h"
18+
#include "lite/core/op_registry.h"
19+
20+
namespace paddle {
21+
namespace lite {
22+
namespace kernels {
23+
namespace xpu {
24+
25+
template <typename T, PrecisionType PType>
26+
void TileCompute<T, PType>::Run() {
27+
auto& param = this->template Param<param_t>();
28+
auto& ctx = this->ctx_->template As<XPUContext>();
29+
auto repeat_times = param.repeat_times;
30+
if (param.RepeatTimes) {
31+
auto repeat_times_size = param.RepeatTimes->data_size();
32+
for (int64_t i = 0; i < repeat_times_size; i++) {
33+
repeat_times.push_back(param.RepeatTimes->template data<int>()[i]);
34+
}
35+
} else if (param.repeat_times_tensor.size() != 0) {
36+
for (int i = 0; i < param.repeat_times_tensor.size(); i++) {
37+
auto temp = param.repeat_times_tensor[i];
38+
repeat_times.push_back(*(temp->template data<int>()));
39+
}
40+
}
41+
auto in_dims = param.X->dims();
42+
auto vec_in_dims = in_dims.Vectorize();
43+
// broadcast for vec_in_dims.size() equal to repeat_times.size()
44+
if (repeat_times.size() < vec_in_dims.size()) {
45+
int diff = vec_in_dims.size() - repeat_times.size();
46+
repeat_times.insert(repeat_times.begin(), diff, 1);
47+
} else {
48+
int diff = repeat_times.size() - vec_in_dims.size();
49+
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
50+
}
51+
52+
std::vector<int> new_in_dims(vec_in_dims.begin(), vec_in_dims.end());
53+
std::vector<int> out_dims(param.Out->dims().data().begin(),
54+
param.Out->dims().data().end());
55+
int r = xdnn::broadcast<T>(ctx.GetRawContext(),
56+
param.X->template data<T>(),
57+
param.Out->template mutable_data<T>(TARGET(kXPU)),
58+
new_in_dims,
59+
out_dims);
60+
61+
CHECK_EQ(r, 0);
62+
}
63+
64+
} // namespace xpu
65+
} // namespace kernels
66+
} // namespace lite
67+
} // namespace paddle
68+
69+
using tile_float =
70+
paddle::lite::kernels::xpu::TileCompute<float, PRECISION(kFloat)>;
71+
REGISTER_LITE_KERNEL(tile, kXPU, kFloat, kNCHW, tile_float, def)
72+
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
73+
.BindInput("RepeatTimes",
74+
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
75+
.BindInput("repeat_times_tensor",
76+
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
77+
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
78+
.Finalize();

lite/kernels/xpu/tile_compute.h

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
#include "lite/core/kernel.h"
17+
18+
namespace paddle {
19+
namespace lite {
20+
namespace kernels {
21+
namespace xpu {
22+
23+
template <typename T, PrecisionType PType>
24+
class TileCompute : public KernelLite<TARGET(kXPU), PType> {
25+
public:
26+
using param_t = operators::TileParam;
27+
28+
virtual void Run();
29+
30+
virtual ~TileCompute() = default;
31+
};
32+
33+
} // namespace xpu
34+
} // namespace kernels
35+
} // namespace lite
36+
} // namespace paddle

lite/operators/tile_op.cc

+1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ bool TileOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
118118
} else if (opdesc.HasInput("repeat_times_tensor") &&
119119
(opdesc.Input("repeat_times_tensor").size() != 0)) {
120120
auto temp = opdesc.Input("repeat_times_tensor");
121+
param_.repeat_times_tensor.clear();
121122
for (auto var : temp) {
122123
param_.repeat_times_tensor.push_back(
123124
scope->FindVar(var)->GetMutable<lite::Tensor>());

lite/tests/kernels/tile_compute_test.cc

+3
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,9 @@ TEST(tile, precision) {
199199
#else
200200
return;
201201
#endif
202+
#elif defined(LITE_WITH_XPU)
203+
place = TARGET(kXPU);
204+
alias = "def";
202205
#elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86)
203206
place = TARGET(kHost);
204207
#else

0 commit comments

Comments
 (0)