From 41b6e67f35648b0ef05e4993244d1cf040672e8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Wed, 29 Jun 2022 13:20:19 +0800 Subject: [PATCH 01/16] [xpu] delete kernel.precision()==float --- lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h index cb3d25d3309..af9fa0435ac 100644 --- a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -94,7 +94,6 @@ class XPUStaticKernelPickPass : public mir::StmtPass { if (kernel_pick_factors_.IsPrecisionConsidered() && (place.precision == kernel.precision() || - kernel.precision() == PRECISION(kFloat) || kernel.precision() == PRECISION(kAny) || place.precision == PRECISION(kAny))) { // score skipped, if kernel is int8, but op is not int8 From 953305e25f0aef311d081497a7273d7ec6fc9094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Thu, 7 Jul 2022 17:29:40 +0800 Subject: [PATCH 02/16] support fp16 data presion, op:relu sigmoid tanh leakyrelu --- lite/kernels/xpu/activation_compute.cc | 84 +++++++++++++++++--------- lite/kernels/xpu/activation_compute.h | 12 ++-- 2 files changed, 63 insertions(+), 33 deletions(-) diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc index 867acb68205..bb92854f0b8 100644 --- a/lite/kernels/xpu/activation_compute.cc +++ b/lite/kernels/xpu/activation_compute.cc @@ -21,13 +21,14 @@ namespace lite { namespace kernels { namespace xpu { -void ReluCompute::Run() { +template +void ReluCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::relu(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } @@ -54,24 +55,26 @@ void GeluCompute::Run() { CHECK_EQ(r, 0); } -void TanhCompute::Run() { +template +void TanhCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::tanh(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } -void SigmoidCompute::Run() { +template +void SigmoidCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::sigmoid(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } @@ -205,13 +208,13 @@ void HardSigmoidCompute::Run() { CHECK_EQ(r, 0); } -void LeakyReluCompute::Run() { +template +void LeakyReluCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - int r = xdnn::leaky_relu(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel(), param.Leaky_relu_alpha); CHECK_EQ(r, 0); @@ -274,12 +277,20 @@ void PReluCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def) +using reluFP32 = + paddle::lite::kernels::xpu::ReluCompute; +using reluFP16 = + paddle::lite::kernels::xpu::ReluCompute; +REGISTER_LITE_KERNEL(relu, kXPU, kFloat, kNCHW, reluFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(relu, kXPU, kFP16, kNCHW, reluFP16, reluFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL( relu6, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Relu6Compute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -292,21 +303,31 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL( - tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def) +using tanhFP32 = + paddle::lite::kernels::xpu::TanhCompute; +using tanhFP16 = + paddle::lite::kernels::xpu::TanhCompute; +REGISTER_LITE_KERNEL(tanh, kXPU, kFloat, kNCHW, tanhFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(tanh, kXPU, kFP16, kNCHW, tanhFP16, tanhFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); -REGISTER_LITE_KERNEL(sigmoid, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::SigmoidCompute, - def) +using sigmoidFP32 = + paddle::lite::kernels::xpu::SigmoidCompute; +using sigmoidFP16 = + paddle::lite::kernels::xpu::SigmoidCompute; +REGISTER_LITE_KERNEL(sigmoid, kXPU, kFloat, kNCHW, sigmoidFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(sigmoid, kXPU, kFP16, kNCHW, sigmoidFP16, sigmoidFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); REGISTER_LITE_KERNEL( abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def) @@ -386,16 +407,21 @@ REGISTER_LITE_KERNEL(hard_swish, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL(leaky_relu, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::LeakyReluCompute, - def) +using leaky_reluFP32 = + paddle::lite::kernels::xpu::LeakyReluCompute; +using leaky_reluFP16 = + paddle::lite::kernels::xpu::LeakyReluCompute; +REGISTER_LITE_KERNEL(leaky_relu, kXPU, kFloat, kNCHW, leaky_reluFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + leaky_relu, kXPU, kFP16, kNCHW, leaky_reluFP16, leaky_reluFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(softsign, kXPU, kFloat, diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h index 057d527ef89..ab47e5ed580 100644 --- a/lite/kernels/xpu/activation_compute.h +++ b/lite/kernels/xpu/activation_compute.h @@ -20,7 +20,8 @@ namespace lite { namespace kernels { namespace xpu { -class ReluCompute : public KernelLite { +template +class ReluCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -47,7 +48,8 @@ class GeluCompute : public KernelLite { virtual ~GeluCompute() = default; }; -class TanhCompute : public KernelLite { +template +class TanhCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -56,7 +58,8 @@ class TanhCompute : public KernelLite { virtual ~TanhCompute() = default; }; -class SigmoidCompute : public KernelLite { +template +class SigmoidCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -164,7 +167,8 @@ class HardSigmoidCompute : public KernelLite { virtual ~HardSigmoidCompute() = default; }; -class LeakyReluCompute : public KernelLite { +template +class LeakyReluCompute : public KernelLite { public: using param_t = operators::ActivationParam; From 2aa49ec073ae91e34419f2ef56869c39046e14f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Thu, 7 Jul 2022 17:37:19 +0800 Subject: [PATCH 03/16] support fp16 data pression,op:elementwise_mul elementwise_add --- lite/kernels/xpu/elementwise_compute.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lite/kernels/xpu/elementwise_compute.cc b/lite/kernels/xpu/elementwise_compute.cc index aaf1c913209..4b8e0e158c5 100644 --- a/lite/kernels/xpu/elementwise_compute.cc +++ b/lite/kernels/xpu/elementwise_compute.cc @@ -132,10 +132,15 @@ void ElementwiseCompute::Run() { namespace xpu = paddle::lite::kernels::xpu; using AddFloat32 = xpu::ElementwiseCompute>; +using AddFloat16 = xpu::ElementwiseCompute>; using AddInt32 = xpu::ElementwiseCompute>; using AddInt64 = xpu::ElementwiseCompute>; + using SubFloat32 = xpu::ElementwiseCompute>; + using MulFloat32 = xpu::ElementwiseCompute>; +using MulFloat16 = xpu::ElementwiseCompute>; + using MulInt64 = xpu::ElementwiseCompute>; using DivFloat32 = xpu::ElementwiseCompute>; using MaxFloat32 = xpu::ElementwiseCompute>; @@ -147,6 +152,13 @@ REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + elementwise_add, kXPU, kFloat, kNCHW, AddFloat16, DISABLE_XPU1_AddFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddInt32, int32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -171,6 +183,13 @@ REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + elementwise_mul, kXPU, kFloat, kNCHW, MulFloat16, DISABLE_XPU1_MulFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulInt64, int64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) From 5ec55029b2d7af373ae922a16f5565b625315bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Fri, 8 Jul 2022 10:24:15 +0800 Subject: [PATCH 04/16] support fp16 data pression,op:conv2d_transpose --- lite/kernels/xpu/conv2d_transpose_compute.cc | 234 +++++++++++++------ lite/kernels/xpu/conv2d_transpose_compute.h | 13 +- 2 files changed, 173 insertions(+), 74 deletions(-) diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index 7949b193c56..440fe571240 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -22,8 +22,42 @@ namespace lite { namespace kernels { namespace xpu { -template <> -void Conv2dTransposeCompute::Run() { +template +void Conv2dTransposeCompute::PrepareForRun() { + auto& param = this->template Param(); + auto filter_ptr = param.filter->template data(); + auto filter_dims = param.filter->dims(); + xpu_quant_filter_ = + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + filter_ptr, filter_dims, false); + +#ifdef LITE_WITH_XPU + int cur_dev_idx = 0; + + XPU_CALL(xpu_current_device(&cur_dev_idx)); + XPU_CALL(xpu_device_get_attr(&cur_dev_attr_, XPUATTR_MODEL, cur_dev_idx)); + if (cur_dev_attr_ <= 1) { + VLOG(4) << "Currents XPU device : XPU1"; + } else if (cur_dev_attr_ >= 2 && cur_dev_attr_ <= 299) { + VLOG(4) << "Currents XPU device : XPU2"; + } else if (cur_dev_attr_ >= 300 && cur_dev_attr_ <= 599) { + VLOG(4) << "Currents XPU device : XPU3"; + } else { + VLOG(4) << "invaid XPU device"; + } +#endif +} + +template +void Conv2dTransposeCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -37,28 +71,61 @@ void Conv2dTransposeCompute::Run() { auto dilations = *param.dilations; if (param.output_padding.empty()) { - int ret = xdnn::conv2d_transpose( - ctx.GetRawContext(), - param.x->data(), - param.filter->data(), - param.output->mutable_data(TARGET(kXPU)), - in_dims[0], - in_dims[1], - in_dims[2], - in_dims[3], - out_dims[1], - std::vector{static_cast(w_dims[2]), - static_cast(w_dims[3])}, - strides, - paddings, - dilations, - groups, - nullptr, - nullptr, - nullptr, - true); - CHECK_EQ(ret, 0); + if (cur_dev_attr_ <= 1) { + // conv2d_transpose_fusion only support kl2,conv2d_transpose only support + // data precision FP32 + CHECK_EQ(sizeof(DX), sizeof(float)); + int ret = xdnn::conv2d_transpose( + ctx.GetRawContext(), + param.x->template data(), + reinterpret_cast( + xpu_quant_filter_.data_ptr_), /* weight */ + param.output->template mutable_data(TARGET(kXPU)), + in_dims[0], + in_dims[1], + in_dims[2], + in_dims[3], + out_dims[1], + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + groups, + nullptr, + reinterpret_cast(xpu_quant_filter_.max_ptr_), + nullptr, + true); + CHECK_EQ(ret, 0); + } else { + int ret = xdnn::conv2d_transpose_fusion( + ctx.GetRawContext(), + param.x->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), /* weight */ + param.output->template mutable_data(TARGET(kXPU)), + in_dims[0], + in_dims[1], + in_dims[2], + in_dims[3], + out_dims[1], + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + groups, + nullptr, + reinterpret_cast(xpu_quant_filter_.max_ptr_), + nullptr, + nullptr, + xdnn::Activation_t::LINEAR, + true); + CHECK_EQ(ret, 0); + } + } else { + CHECK_EQ(sizeof(DX), sizeof(DY)); + int n = in_dims[0]; int yc = in_dims[1]; int yh = in_dims[2]; @@ -68,65 +135,67 @@ void Conv2dTransposeCompute::Run() { int xw = out_dims[3]; int kh = w_dims[2]; int kw = w_dims[3]; - float* x_trans = nullptr; + DX* x_trans = nullptr; XPU_CALL(xpu_malloc(reinterpret_cast(&x_trans), - (param.x->numel()) * sizeof(float))); - float* x_col_before_concat = nullptr; + (param.x->numel()) * sizeof(DX))); + DX* x_col_before_concat = nullptr; XPU_CALL(xpu_malloc(reinterpret_cast(&x_col_before_concat), - (n * yh * yw * kh * kw * xc) * sizeof(float))); - float* x_col = nullptr; + (n * yh * yw * kh * kw * xc) * sizeof(DX))); + DX* x_col = nullptr; XPU_CALL(xpu_malloc(reinterpret_cast(&x_col), - (n * yh * yw * kh * kw * xc) * sizeof(float))); - const float* weight = param.filter->data(); - int ret = xdnn::transpose(ctx.GetRawContext(), - param.x->data(), - x_trans, - {n, groups, yc / groups, yh, yw}, - {1, 0, 3, 4, 2}); + (n * yh * yw * kh * kw * xc) * sizeof(DX))); + const TW* weight = reinterpret_cast(xpu_quant_filter_.data_ptr_); + int ret = xdnn::transpose(ctx.GetRawContext(), + param.x->template data(), + x_trans, + {n, groups, yc / groups, yh, yw}, + {1, 0, 3, 4, 2}); CHECK_EQ(ret, 0); + for (int g = 0; g < groups; g++) { - const float* curr_y = x_trans + g * n * yh * yw * (yc / groups); - const float* curr_w = - weight + g * (yc / groups) * (xc / groups) * kh * kw; - float* curr_x = + const DX* curr_y = x_trans + g * n * yh * yw * (yc / groups); + const TW* curr_w = weight + g * (yc / groups) * (xc / groups) * kh * kw; + DX* curr_x = x_col_before_concat + g * n * yh * yw * (xc / groups) * kh * kw; int mac_m = n * yh * yw; int mac_k = yc / groups; int mac_n = xc / groups * kh * kw; - ret = xdnn::fc(ctx.GetRawContext(), - curr_y, - curr_w, - curr_x, - mac_m, - mac_n, - mac_k, - false, - false, - nullptr, - nullptr, - nullptr); + ret = xdnn::fc( + ctx.GetRawContext(), + curr_y, + reinterpret_cast(curr_w), + curr_x, + mac_m, + mac_n, + mac_k, + false, + false, + nullptr, + reinterpret_cast(xpu_quant_filter_.max_ptr_), + nullptr); CHECK_EQ(ret, 0); } - ret = xdnn::transpose(ctx.GetRawContext(), - x_col_before_concat, - x_col, - {groups, n * yh * yw, (xc / groups) * kh * kw}, - {1, 0, 2}); + ret = xdnn::transpose(ctx.GetRawContext(), + x_col_before_concat, + x_col, + {groups, n * yh * yw, (xc / groups) * kh * kw}, + {1, 0, 2}); CHECK_EQ(ret, 0); - ret = xdnn::col2im(ctx.GetRawContext(), - x_col, - param.output->mutable_data(TARGET(kXPU)), - n, - xc, - xh, - xw, - std::vector{static_cast(w_dims[2]), - static_cast(w_dims[3])}, - strides, - paddings, - dilations, - true); + ret = + xdnn::col2im(ctx.GetRawContext(), + x_col, + param.output->template mutable_data(TARGET(kXPU)), + n, + xc, + xh, + xw, + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + true); CHECK_EQ(ret, 0); XPU_CALL(xpu_free(x_trans)); XPU_CALL(xpu_free(x_col_before_concat)); @@ -140,12 +209,33 @@ void Conv2dTransposeCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using Conv2dTransposeFp32 = xpu::Conv2dTransposeCompute; + +using Conv2dTranspose_FP16_FP32_FP32 = xpu:: + Conv2dTransposeCompute; + +using Conv2dTransposeFp16 = xpu::Conv2dTransposeCompute; REGISTER_LITE_KERNEL( - conv2d_transpose, kXPU, kFloat, kNCHW, Conv2dTransposeFp32, def) + conv2d_transpose, kXPU, kFloat, kNCHW, Conv2dTranspose_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL(conv2d_transpose, + kXPU, + kFP16, + kNCHW, + Conv2dTransposeFp16, + DISABLE_XPU1_Conv2dTransposeFp16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv2d_transpose_compute.h b/lite/kernels/xpu/conv2d_transpose_compute.h index 5a3d8714fd4..a5f7dfa8240 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.h +++ b/lite/kernels/xpu/conv2d_transpose_compute.h @@ -23,14 +23,23 @@ namespace lite { namespace kernels { namespace xpu { -template -class Conv2dTransposeCompute : public KernelLite { +template +class Conv2dTransposeCompute : public KernelLite { public: using param_t = operators::ConvParam; + void PrepareForRun() override; void Run() override; virtual ~Conv2dTransposeCompute() = default; + + private: + XPUQuantData xpu_quant_filter_; + uint64_t cur_dev_attr_ = 0; }; } // namespace xpu From ade29793b4c3841d7d80770c02aa5653574daa5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Fri, 8 Jul 2022 10:28:56 +0800 Subject: [PATCH 05/16] add op: transpose2 registe kernel data precision: int32 --- lite/kernels/xpu/transpose_compute.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lite/kernels/xpu/transpose_compute.cc b/lite/kernels/xpu/transpose_compute.cc index d1c9553ba71..19441de2849 100644 --- a/lite/kernels/xpu/transpose_compute.cc +++ b/lite/kernels/xpu/transpose_compute.cc @@ -75,6 +75,18 @@ REGISTER_LITE_KERNEL(transpose2, .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); +REGISTER_LITE_KERNEL(transpose2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::TransposeCompute, + def_int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .Finalize(); + REGISTER_LITE_KERNEL(transpose2, kXPU, kFloat, From 9656f3be74f3bee65e5c234c8268e6d2e15b4fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Fri, 8 Jul 2022 10:31:02 +0800 Subject: [PATCH 06/16] support fp16 data pression,op:reduce_mean --- lite/kernels/xpu/reduce_compute.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lite/kernels/xpu/reduce_compute.cc b/lite/kernels/xpu/reduce_compute.cc index da2477d48ba..8563ec4d601 100644 --- a/lite/kernels/xpu/reduce_compute.cc +++ b/lite/kernels/xpu/reduce_compute.cc @@ -154,6 +154,8 @@ using ReduceAll = xpu::ReduceCompute>; using ReduceAny = xpu::ReduceCompute>; using ReduceMeanFloat32 = xpu::ReduceCompute>; +using ReduceMeanFloat16 = + xpu::ReduceCompute>; using ReduceSumFloat32 = xpu::ReduceCompute>; using ReduceProdFloat32 = @@ -178,6 +180,16 @@ REGISTER_LITE_KERNEL(reduce_mean, kXPU, kFloat, kNCHW, ReduceMeanFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(reduce_mean, + kXPU, + kFloat, + kNCHW, + ReduceMeanFloat16, + DISABLE_XPU1_ReduceMeanFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(reduce_sum, kXPU, kFloat, kNCHW, ReduceSumFloat32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) From 13a477767307a3a7474288fbc63971fad3fd7f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Fri, 8 Jul 2022 10:34:41 +0800 Subject: [PATCH 07/16] support fp16 data pression,op:reshape,reshape2,flatten2,flatten --- lite/kernels/xpu/reshape_compute.cc | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/lite/kernels/xpu/reshape_compute.cc b/lite/kernels/xpu/reshape_compute.cc index 78359443991..c82e367e9eb 100644 --- a/lite/kernels/xpu/reshape_compute.cc +++ b/lite/kernels/xpu/reshape_compute.cc @@ -69,6 +69,21 @@ REGISTER_LITE_KERNEL(reshape2, .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); +REGISTER_LITE_KERNEL(reshape2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + REGISTER_LITE_KERNEL(reshape2, kXPU, kFloat, @@ -113,6 +128,20 @@ REGISTER_LITE_KERNEL(reshape, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(reshape, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(flatten, kXPU, kFloat, @@ -125,6 +154,18 @@ REGISTER_LITE_KERNEL(flatten, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(flatten, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(flatten2, kXPU, kFloat, @@ -137,3 +178,16 @@ REGISTER_LITE_KERNEL(flatten2, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); + +REGISTER_LITE_KERNEL(flatten2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); From b8e539c4aafcdf8c561f7a0d2401867733c0cba8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Fri, 8 Jul 2022 11:02:37 +0800 Subject: [PATCH 08/16] support fp16 data pression --- .../mir/__xpu__static_kernel_pick_pass.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h index af9fa0435ac..7e97704034d 100644 --- a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -93,9 +93,12 @@ class XPUStaticKernelPickPass : public mir::StmtPass { VLOG(4) << "[score s1]:" << score; if (kernel_pick_factors_.IsPrecisionConsidered() && - (place.precision == kernel.precision() || - kernel.precision() == PRECISION(kAny) || - place.precision == PRECISION(kAny))) { + (place.precision == kernel.precision() || + kernel.precision() == PRECISION(kAny) || + place.precision == PRECISION(kAny)) || + // fp16 may also pick FP32 kernel preciison + (xpu_use_fp16_optimizer_ && + kernel.precision() == PRECISION(kFloat))) { // score skipped, if kernel is int8, but op is not int8 if (!(kernel.precision() == PRECISION(kInt8) && !instruct.op_info()->HasAttr("enable_int8"))) { @@ -314,7 +317,15 @@ class XPUStaticKernelPickPass : public mir::StmtPass { "gather", "pool2d", "concat", - "calib"}; + "calib", + "relu", + "tanh", + "sigmoid", + "leaky_relu", + "conv2d_transpose", + "elementwise_mul", + "elementwise_add", + "reduce_mean"}; const std::set xpu_inplace_op_{"reshape", "reshape2", "flatten", From 1420802c4dc093676e1608c1121d2cfd323ad613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Fri, 8 Jul 2022 11:06:50 +0800 Subject: [PATCH 09/16] support fp16 data pression --- .../optimizer/mir/__xpu__static_kernel_pick_pass.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h index 7e97704034d..65849721daf 100644 --- a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -93,12 +93,12 @@ class XPUStaticKernelPickPass : public mir::StmtPass { VLOG(4) << "[score s1]:" << score; if (kernel_pick_factors_.IsPrecisionConsidered() && - (place.precision == kernel.precision() || - kernel.precision() == PRECISION(kAny) || - place.precision == PRECISION(kAny)) || - // fp16 may also pick FP32 kernel preciison - (xpu_use_fp16_optimizer_ && - kernel.precision() == PRECISION(kFloat))) { + (place.precision == kernel.precision() || + kernel.precision() == PRECISION(kAny) || + place.precision == PRECISION(kAny) || + // fp16 may also pick FP32 kernel preciison + (xpu_use_fp16_optimizer_ && + kernel.precision() == PRECISION(kFloat)))) { // score skipped, if kernel is int8, but op is not int8 if (!(kernel.precision() == PRECISION(kInt8) && !instruct.op_info()->HasAttr("enable_int8"))) { From 928e53068b6a55d2f473ee1ee9102daa813d68d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Fri, 8 Jul 2022 17:28:58 +0800 Subject: [PATCH 10/16] xpu pick kernel force use fp16 --- lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc | 8 +++++++- lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc index 038e7e22678..d55b9aad45c 100644 --- a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc @@ -44,8 +44,8 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr& graph) { // Collect input data precision for each node in the graph #ifdef LITE_WITH_XPU DicideUseFP16Optimizer(graph); + GetXPUDeviceType(); if (xpu_use_fp16_optimizer_) { - GetXPUDeviceType(); for (auto& node : graph->StmtTopologicalOrder()) { if (!node->IsStmt()) continue; if (xpu_special_op_.count(node->AsStmt().op_type())) { @@ -235,6 +235,12 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr& graph) { #ifdef LITE_WITH_XPU void XPUStaticKernelPickPass::DicideUseFP16Optimizer( const std::unique_ptr& graph) { + if (GetStringFromEnv("XPUForceUseFP16", "false") == "true") { + xpu_use_fp16_optimizer_ = false; + VLOG(2) << "XPU force use data precision: FP16 "; + return; + } + if (graph->valid_places()[0].precision == PrecisionType::kFP16) { xpu_use_fp16_optimizer_ = true; VLOG(2) << "XPU auto use data precision: FP16/FP32/INT16 "; diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h index 65849721daf..8e2a6bffd2e 100644 --- a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -297,8 +297,9 @@ class XPUStaticKernelPickPass : public mir::StmtPass { private: core::KernelPickFactor kernel_pick_factors_; -#ifdef LITE_WITH_XPU + bool xpu_use_fp16_optimizer_{false}; +#ifdef LITE_WITH_XPU // TODO(quwei:) addn more op const std::set PRECISION_INT31_OP_{"__xpu__fc"}; const std::set PRECISION_INT8_OP_{"__xpu__fc"}; From 5073d6fbbee9aba10b486601ef7a43c88d123185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Sun, 10 Jul 2022 09:41:30 +0800 Subject: [PATCH 11/16] update --- lite/kernels/xpu/conv2d_transpose_compute.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index 440fe571240..ba2ca788015 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -14,6 +14,8 @@ #include "lite/kernels/xpu/conv2d_transpose_compute.h" #include +#include "lite/backends/xpu/math.h" +#include "lite/backends/xpu/target_wrapper.h" #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" From 147f0613bf431f5503de860c908d4f8bbd8b7e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Mon, 11 Jul 2022 09:43:06 +0800 Subject: [PATCH 12/16] update --- lite/kernels/xpu/conv2d_transpose_compute.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index ba2ca788015..8ca66afae4a 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -31,6 +31,8 @@ template void Conv2dTransposeCompute::PrepareForRun() { auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto filter_ptr = param.filter->template data(); auto filter_dims = param.filter->dims(); xpu_quant_filter_ = From 618ac8ab3275703dfc126a570afc3297112ecd13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Mon, 11 Jul 2022 11:40:07 +0800 Subject: [PATCH 13/16] update --- lite/kernels/xpu/conv2d_transpose_compute.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index 8ca66afae4a..9fe64ee389c 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -32,6 +32,7 @@ template ::PrepareForRun() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); + ctx.GetRawContext(); auto filter_ptr = param.filter->template data(); auto filter_dims = param.filter->dims(); From 1c9bee9cd1cfacb57513c45eaaed187e09b0e49c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Mon, 11 Jul 2022 16:30:59 +0800 Subject: [PATCH 14/16] update --- lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h index 8e2a6bffd2e..38f786b5216 100644 --- a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -98,7 +98,8 @@ class XPUStaticKernelPickPass : public mir::StmtPass { place.precision == PRECISION(kAny) || // fp16 may also pick FP32 kernel preciison (xpu_use_fp16_optimizer_ && - kernel.precision() == PRECISION(kFloat)))) { + kernel.precision() == PRECISION(kFloat) && + place.precision == PRECISION(kFP16)))) { // score skipped, if kernel is int8, but op is not int8 if (!(kernel.precision() == PRECISION(kInt8) && !instruct.op_info()->HasAttr("enable_int8"))) { From 0f96642974a4b26625f2fe0f94415997871e58af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cquwei03=E2=80=9D?= Date: Mon, 11 Jul 2022 21:15:50 +0800 Subject: [PATCH 15/16] update --- lite/kernels/xpu/conv2d_transpose_compute.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index 9fe64ee389c..eff1d1543f2 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -216,7 +216,7 @@ void Conv2dTransposeCompute::Run() { namespace xpu = paddle::lite::kernels::xpu; using Conv2dTranspose_FP16_FP32_FP32 = xpu:: - Conv2dTransposeCompute; + Conv2dTransposeCompute; using Conv2dTransposeFp16 = xpu::Conv2dTransposeCompute Date: Tue, 12 Jul 2022 09:09:55 +0800 Subject: [PATCH 16/16] update --- lite/kernels/xpu/conv2d_transpose_compute.cc | 176 +++++++------------ lite/kernels/xpu/conv2d_transpose_compute.h | 11 +- 2 files changed, 64 insertions(+), 123 deletions(-) diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index eff1d1543f2..0ec8532b4bc 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -14,8 +14,6 @@ #include "lite/kernels/xpu/conv2d_transpose_compute.h" #include -#include "lite/backends/xpu/math.h" -#include "lite/backends/xpu/target_wrapper.h" #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -24,23 +22,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void Conv2dTransposeCompute::PrepareForRun() { - auto& param = this->template Param(); - auto& ctx = this->ctx_->template As(); - ctx.GetRawContext(); - - auto filter_ptr = param.filter->template data(); - auto filter_dims = param.filter->dims(); - xpu_quant_filter_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - filter_ptr, filter_dims, false); - -#ifdef LITE_WITH_XPU +template <> +void Conv2dTransposeCompute::PrepareForRun() { int cur_dev_idx = 0; XPU_CALL(xpu_current_device(&cur_dev_idx)); @@ -54,15 +37,10 @@ void Conv2dTransposeCompute::PrepareForRun() { } else { VLOG(4) << "invaid XPU device"; } -#endif } -template -void Conv2dTransposeCompute::Run() { +template <> +void Conv2dTransposeCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -77,15 +55,11 @@ void Conv2dTransposeCompute::Run() { if (param.output_padding.empty()) { if (cur_dev_attr_ <= 1) { - // conv2d_transpose_fusion only support kl2,conv2d_transpose only support - // data precision FP32 - CHECK_EQ(sizeof(DX), sizeof(float)); int ret = xdnn::conv2d_transpose( ctx.GetRawContext(), - param.x->template data(), - reinterpret_cast( - xpu_quant_filter_.data_ptr_), /* weight */ - param.output->template mutable_data(TARGET(kXPU)), + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), in_dims[0], in_dims[1], in_dims[2], @@ -98,16 +72,16 @@ void Conv2dTransposeCompute::Run() { dilations, groups, nullptr, - reinterpret_cast(xpu_quant_filter_.max_ptr_), + nullptr, nullptr, true); CHECK_EQ(ret, 0); } else { - int ret = xdnn::conv2d_transpose_fusion( + int ret = xdnn::conv2d_transpose_fusion( ctx.GetRawContext(), - param.x->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), /* weight */ - param.output->template mutable_data(TARGET(kXPU)), + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), in_dims[0], in_dims[1], in_dims[2], @@ -120,17 +94,14 @@ void Conv2dTransposeCompute::Run() { dilations, groups, nullptr, - reinterpret_cast(xpu_quant_filter_.max_ptr_), + nullptr, nullptr, nullptr, xdnn::Activation_t::LINEAR, true); CHECK_EQ(ret, 0); } - } else { - CHECK_EQ(sizeof(DX), sizeof(DY)); - int n = in_dims[0]; int yc = in_dims[1]; int yh = in_dims[2]; @@ -140,67 +111,65 @@ void Conv2dTransposeCompute::Run() { int xw = out_dims[3]; int kh = w_dims[2]; int kw = w_dims[3]; - DX* x_trans = nullptr; + float* x_trans = nullptr; XPU_CALL(xpu_malloc(reinterpret_cast(&x_trans), - (param.x->numel()) * sizeof(DX))); - DX* x_col_before_concat = nullptr; + (param.x->numel()) * sizeof(float))); + float* x_col_before_concat = nullptr; XPU_CALL(xpu_malloc(reinterpret_cast(&x_col_before_concat), - (n * yh * yw * kh * kw * xc) * sizeof(DX))); - DX* x_col = nullptr; + (n * yh * yw * kh * kw * xc) * sizeof(float))); + float* x_col = nullptr; XPU_CALL(xpu_malloc(reinterpret_cast(&x_col), - (n * yh * yw * kh * kw * xc) * sizeof(DX))); - const TW* weight = reinterpret_cast(xpu_quant_filter_.data_ptr_); - int ret = xdnn::transpose(ctx.GetRawContext(), - param.x->template data(), - x_trans, - {n, groups, yc / groups, yh, yw}, - {1, 0, 3, 4, 2}); + (n * yh * yw * kh * kw * xc) * sizeof(float))); + const float* weight = param.filter->data(); + int ret = xdnn::transpose(ctx.GetRawContext(), + param.x->data(), + x_trans, + {n, groups, yc / groups, yh, yw}, + {1, 0, 3, 4, 2}); CHECK_EQ(ret, 0); - for (int g = 0; g < groups; g++) { - const DX* curr_y = x_trans + g * n * yh * yw * (yc / groups); - const TW* curr_w = weight + g * (yc / groups) * (xc / groups) * kh * kw; - DX* curr_x = + const float* curr_y = x_trans + g * n * yh * yw * (yc / groups); + const float* curr_w = + weight + g * (yc / groups) * (xc / groups) * kh * kw; + float* curr_x = x_col_before_concat + g * n * yh * yw * (xc / groups) * kh * kw; int mac_m = n * yh * yw; int mac_k = yc / groups; int mac_n = xc / groups * kh * kw; - ret = xdnn::fc( - ctx.GetRawContext(), - curr_y, - reinterpret_cast(curr_w), - curr_x, - mac_m, - mac_n, - mac_k, - false, - false, - nullptr, - reinterpret_cast(xpu_quant_filter_.max_ptr_), - nullptr); + ret = xdnn::fc(ctx.GetRawContext(), + curr_y, + curr_w, + curr_x, + mac_m, + mac_n, + mac_k, + false, + false, + nullptr, + nullptr, + nullptr); CHECK_EQ(ret, 0); } - ret = xdnn::transpose(ctx.GetRawContext(), - x_col_before_concat, - x_col, - {groups, n * yh * yw, (xc / groups) * kh * kw}, - {1, 0, 2}); + ret = xdnn::transpose(ctx.GetRawContext(), + x_col_before_concat, + x_col, + {groups, n * yh * yw, (xc / groups) * kh * kw}, + {1, 0, 2}); CHECK_EQ(ret, 0); - ret = - xdnn::col2im(ctx.GetRawContext(), - x_col, - param.output->template mutable_data(TARGET(kXPU)), - n, - xc, - xh, - xw, - std::vector{static_cast(w_dims[2]), - static_cast(w_dims[3])}, - strides, - paddings, - dilations, - true); + ret = xdnn::col2im(ctx.GetRawContext(), + x_col, + param.output->mutable_data(TARGET(kXPU)), + n, + xc, + xh, + xw, + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + true); CHECK_EQ(ret, 0); XPU_CALL(xpu_free(x_trans)); XPU_CALL(xpu_free(x_col_before_concat)); @@ -214,33 +183,12 @@ void Conv2dTransposeCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; - -using Conv2dTranspose_FP16_FP32_FP32 = xpu:: - Conv2dTransposeCompute; - -using Conv2dTransposeFp16 = xpu::Conv2dTransposeCompute; +using Conv2dTransposeFp32 = xpu::Conv2dTransposeCompute; REGISTER_LITE_KERNEL( - conv2d_transpose, kXPU, kFloat, kNCHW, Conv2dTranspose_FP16_FP32_FP32, def) + conv2d_transpose, kXPU, kFloat, kNCHW, Conv2dTransposeFp32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); - -REGISTER_LITE_KERNEL(conv2d_transpose, - kXPU, - kFP16, - kNCHW, - Conv2dTransposeFp16, - DISABLE_XPU1_Conv2dTransposeFp16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindOutput("Output", - {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) - .Finalize(); diff --git a/lite/kernels/xpu/conv2d_transpose_compute.h b/lite/kernels/xpu/conv2d_transpose_compute.h index a5f7dfa8240..6e779fc42ad 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.h +++ b/lite/kernels/xpu/conv2d_transpose_compute.h @@ -23,12 +23,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class Conv2dTransposeCompute : public KernelLite { +template +class Conv2dTransposeCompute : public KernelLite { public: using param_t = operators::ConvParam; @@ -36,9 +32,6 @@ class Conv2dTransposeCompute : public KernelLite { void Run() override; virtual ~Conv2dTransposeCompute() = default; - - private: - XPUQuantData xpu_quant_filter_; uint64_t cur_dev_attr_ = 0; };