Commit
[XPU] support fp16 data precision (PaddlePaddle#9228)
xiuxin121 authored and newway committed Aug 23, 2022
1 parent 756810f commit 1894404
Showing 10 changed files with 249 additions and 58 deletions.
8 changes: 7 additions & 1 deletion lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc
@@ -44,8 +44,8 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Collect input data precision for each node in the graph
#ifdef LITE_WITH_XPU
DicideUseFP16Optimizer(graph);
GetXPUDeviceType();
if (xpu_use_fp16_optimizer_) {
GetXPUDeviceType();
for (auto& node : graph->StmtTopologicalOrder()) {
if (!node->IsStmt()) continue;
if (xpu_special_op_.count(node->AsStmt().op_type())) {
@@ -235,6 +235,12 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_XPU
void XPUStaticKernelPickPass::DicideUseFP16Optimizer(
const std::unique_ptr<SSAGraph>& graph) {
if (GetStringFromEnv("XPUForceUseFP16", "false") == "true") {
xpu_use_fp16_optimizer_ = false;
VLOG(2) << "XPU force use data precision: FP16 ";
return;
}

if (graph->valid_places()[0].precision == PrecisionType::kFP16) {
xpu_use_fp16_optimizer_ = true;
VLOG(2) << "XPU auto use data precision: FP16/FP32/INT16 ";
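For reference, the pass above turns the mixed-precision path on when the first valid place requests kFP16, while the new XPUForceUseFP16 environment variable skips that selection and, per its log message, forces the FP16 data precision. A minimal caller-side sketch, assuming the standard Paddle Lite CxxConfig API (the header name, place list, and helper function below are illustrative and not part of this commit):

#include <cstdlib>
#include <memory>
#include <string>

#include "paddle_api.h"  // Paddle Lite C++ API header; name assumed from the public demos.

std::shared_ptr<paddle::lite_api::PaddlePredictor> BuildXPUFP16Predictor(
    const std::string& model_dir) {
  // Env var added in this commit; per the pass's log message it forces the
  // FP16 data precision instead of the mixed FP16/FP32 kernel selection.
  setenv("XPUForceUseFP16", "true", /*overwrite=*/1);

  paddle::lite_api::CxxConfig config;
  config.set_model_dir(model_dir);
  // Listing kFP16 first is what makes valid_places()[0].precision == kFP16,
  // the condition DicideUseFP16Optimizer checks above.
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFP16)},
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
  return paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
      config);
}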
19 changes: 16 additions & 3 deletions lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h
@@ -95,7 +95,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
if (kernel_pick_factors_.IsPrecisionConsidered() &&
(place.precision == kernel.precision() ||
kernel.precision() == PRECISION(kAny) ||
place.precision == PRECISION(kAny))) {
place.precision == PRECISION(kAny) ||
// fp16 may also pick the FP32 kernel precision
(xpu_use_fp16_optimizer_ &&
kernel.precision() == PRECISION(kFloat) &&
place.precision == PRECISION(kFP16)))) {
// score skipped, if kernel is int8, but op is not int8
if (!(kernel.precision() == PRECISION(kInt8) &&
!instruct.op_info()->HasAttr("enable_int8"))) {
@@ -294,8 +298,9 @@ class XPUStaticKernelPickPass : public mir::StmtPass {

private:
core::KernelPickFactor kernel_pick_factors_;
#ifdef LITE_WITH_XPU

bool xpu_use_fp16_optimizer_{false};
#ifdef LITE_WITH_XPU
// TODO(quwei): add more ops
const std::set<std::string> PRECISION_INT31_OP_{"__xpu__fc"};
const std::set<std::string> PRECISION_INT8_OP_{"__xpu__fc"};
@@ -314,7 +319,15 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
"gather",
"pool2d",
"concat",
"calib"};
"calib",
"relu",
"tanh",
"sigmoid",
"leaky_relu",
"conv2d_transpose",
"elementwise_mul",
"elementwise_add",
"reduce_mean"};
const std::set<std::string> xpu_inplace_op_{"reshape",
"reshape2",
"flatten",
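The condition added above widens precision matching so that, when the FP16 optimizer is active, a kFP16 place may still accept an op's FP32 (kFloat) kernel. Restated as a standalone predicate for clarity (the helper name is illustrative, not part of the commit):

// Returns true when a kernel's precision is acceptable for the requested
// place, mirroring the expression in the diff above.
static bool PrecisionMatches(PrecisionType kernel_precision,
                             PrecisionType place_precision,
                             bool xpu_use_fp16_optimizer) {
  if (kernel_precision == place_precision) return true;
  if (kernel_precision == PRECISION(kAny)) return true;
  if (place_precision == PRECISION(kAny)) return true;
  // New in this commit: FP16 places may fall back to FP32 kernels.
  return xpu_use_fp16_optimizer &&
         kernel_precision == PRECISION(kFloat) &&
         place_precision == PRECISION(kFP16);
}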
84 changes: 55 additions & 29 deletions lite/kernels/xpu/activation_compute.cc
@@ -21,13 +21,14 @@ namespace lite {
namespace kernels {
namespace xpu {

void ReluCompute::Run() {
template <typename T, PrecisionType PType>
void ReluCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::relu(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}
@@ -54,24 +55,26 @@ void GeluCompute::Run() {
CHECK_EQ(r, 0);
}

void TanhCompute::Run() {
template <typename T, PrecisionType PType>
void TanhCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::tanh(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}

void SigmoidCompute::Run() {
template <typename T, PrecisionType PType>
void SigmoidCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::sigmoid(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}
@@ -205,13 +208,13 @@ void HardSigmoidCompute::Run() {
CHECK_EQ(r, 0);
}

void LeakyReluCompute::Run() {
template <typename T, PrecisionType PType>
void LeakyReluCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::leaky_relu(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel(),
param.Leaky_relu_alpha);
CHECK_EQ(r, 0);
@@ -274,12 +277,20 @@ void PReluCompute::Run() {
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(
relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def)
using reluFP32 =
paddle::lite::kernels::xpu::ReluCompute<float, PRECISION(kFloat)>;
using reluFP16 =
paddle::lite::kernels::xpu::ReluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(relu, kXPU, kFloat, kNCHW, reluFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(relu, kXPU, kFP16, kNCHW, reluFP16, reluFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
relu6, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Relu6Compute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
@@ -292,21 +303,31 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def)
using tanhFP32 =
paddle::lite::kernels::xpu::TanhCompute<float, PRECISION(kFloat)>;
using tanhFP16 =
paddle::lite::kernels::xpu::TanhCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(tanh, kXPU, kFloat, kNCHW, tanhFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(tanh, kXPU, kFP16, kNCHW, tanhFP16, tanhFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(sigmoid,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SigmoidCompute,
def)
using sigmoidFP32 =
paddle::lite::kernels::xpu::SigmoidCompute<float, PRECISION(kFloat)>;
using sigmoidFP16 =
paddle::lite::kernels::xpu::SigmoidCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(sigmoid, kXPU, kFloat, kNCHW, sigmoidFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid, kXPU, kFP16, kNCHW, sigmoidFP16, sigmoidFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def)
@@ -386,16 +407,21 @@ REGISTER_LITE_KERNEL(hard_swish,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(leaky_relu,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LeakyReluCompute,
def)
using leaky_reluFP32 =
paddle::lite::kernels::xpu::LeakyReluCompute<float, PRECISION(kFloat)>;
using leaky_reluFP16 =
paddle::lite::kernels::xpu::LeakyReluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(leaky_relu, kXPU, kFloat, kNCHW, leaky_reluFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
leaky_relu, kXPU, kFP16, kNCHW, leaky_reluFP16, leaky_reluFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(softsign,
kXPU,
kFloat,
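Each activation this commit converts follows the same recipe: the compute class gains <T, PType> template parameters, the FP32 variant keeps its kFloat registration, and a second kFP16 registration binds X and Out as FP16 XPU tensors. As a hypothetical illustration of extending the recipe to relu6 (not part of this commit; it assumes Relu6Compute is templated the same way in activation_compute.h and that xdnn::relu6 shares the (ctx, x, y, len) signature used by xdnn::relu above):

// HYPOTHETICAL sketch, not in this commit.
template <typename T, PrecisionType PType>
void Relu6Compute<T, PType>::Run() {
  auto& param = this->template Param<param_t>();
  auto& ctx = this->ctx_->template As<XPUContext>();

  // Assumes xdnn::relu6 takes (ctx, x, y, len) like xdnn::relu.
  int r = xdnn::relu6(ctx.GetRawContext(),
                      param.X->template data<T>(),
                      param.Out->template mutable_data<T>(TARGET(kXPU)),
                      param.X->numel());
  CHECK_EQ(r, 0);
}

using relu6FP16 =
    paddle::lite::kernels::xpu::Relu6Compute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(relu6, kXPU, kFP16, kNCHW, relu6FP16, relu6FP16)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .Finalize();

For the pick pass to actually route FP16 data to such a kernel, the op would also need to be added to xpu_special_op_ in __xpu__static_kernel_pick_pass.h.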
12 changes: 8 additions & 4 deletions lite/kernels/xpu/activation_compute.h
@@ -20,7 +20,8 @@ namespace lite {
namespace kernels {
namespace xpu {

class ReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class ReluCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand All @@ -47,7 +48,8 @@ class GeluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~GeluCompute() = default;
};

class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class TanhCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand All @@ -56,7 +58,8 @@ class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~TanhCompute() = default;
};

class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class SigmoidCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

@@ -164,7 +167,8 @@ class HardSigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~HardSigmoidCompute() = default;
};

class LeakyReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class LeakyReluCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

85 changes: 64 additions & 21 deletions lite/kernels/xpu/conv2d_transpose_compute.cc
@@ -22,6 +22,23 @@ namespace lite {
namespace kernels {
namespace xpu {

template <>
void Conv2dTransposeCompute<PRECISION(kFloat)>::PrepareForRun() {
int cur_dev_idx = 0;

XPU_CALL(xpu_current_device(&cur_dev_idx));
XPU_CALL(xpu_device_get_attr(&cur_dev_attr_, XPUATTR_MODEL, cur_dev_idx));
if (cur_dev_attr_ <= 1) {
VLOG(4) << "Currents XPU device : XPU1";
} else if (cur_dev_attr_ >= 2 && cur_dev_attr_ <= 299) {
VLOG(4) << "Currents XPU device : XPU2";
} else if (cur_dev_attr_ >= 300 && cur_dev_attr_ <= 599) {
VLOG(4) << "Currents XPU device : XPU3";
} else {
VLOG(4) << "invaid XPU device";
}
}

template <>
void Conv2dTransposeCompute<PRECISION(kFloat)>::Run() {
auto& param = this->template Param<param_t>();
@@ -37,27 +54,53 @@ void Conv2dTransposeCompute<PRECISION(kFloat)>::Run() {
auto dilations = *param.dilations;

if (param.output_padding.empty()) {
int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
true);
CHECK_EQ(ret, 0);
if (cur_dev_attr_ <= 1) {
int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
true);
CHECK_EQ(ret, 0);
} else {
int ret = xdnn::conv2d_transpose_fusion<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
nullptr,
xdnn::Activation_t::LINEAR,
true);
CHECK_EQ(ret, 0);
}
} else {
int n = in_dims[0];
int yc = in_dims[1];
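The new PrepareForRun() queries the device model attribute once, and Run() branches on it: devices with an attribute value of at most 1 (XPU1) keep the plain xdnn::conv2d_transpose call, while anything newer takes xdnn::conv2d_transpose_fusion with a LINEAR activation. The range logic, restated as a standalone helper (illustrative only, not part of the commit):

enum class XPUGeneration { kXPU1, kXPU2, kXPU3, kUnknown };

// Mirrors the ranges logged by PrepareForRun() above; Run() takes the fusion
// path whenever the attribute is greater than 1.
inline XPUGeneration ClassifyXPUDevice(uint64_t dev_attr) {
  if (dev_attr <= 1) return XPUGeneration::kXPU1;
  if (dev_attr >= 2 && dev_attr <= 299) return XPUGeneration::kXPU2;
  if (dev_attr >= 300 && dev_attr <= 599) return XPUGeneration::kXPU3;
  return XPUGeneration::kUnknown;
}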
2 changes: 2 additions & 0 deletions lite/kernels/xpu/conv2d_transpose_compute.h
@@ -28,9 +28,11 @@ class Conv2dTransposeCompute : public KernelLite<TARGET(kXPU), FilterPtype> {
public:
using param_t = operators::ConvParam;

void PrepareForRun() override;
void Run() override;

virtual ~Conv2dTransposeCompute() = default;
uint64_t cur_dev_attr_ = 0;
};

} // namespace xpu
(Diffs for the remaining changed files were not loaded on this page.)
