Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[XPU] support fp16 data precision #9228

Merged
merged 19 commits into from
Jul 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Collect input data precision for each node in the graph
#ifdef LITE_WITH_XPU
DicideUseFP16Optimizer(graph);
GetXPUDeviceType();
if (xpu_use_fp16_optimizer_) {
GetXPUDeviceType();
for (auto& node : graph->StmtTopologicalOrder()) {
if (!node->IsStmt()) continue;
if (xpu_special_op_.count(node->AsStmt().op_type())) {
Expand Down Expand Up @@ -235,6 +235,12 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_XPU
void XPUStaticKernelPickPass::DicideUseFP16Optimizer(
const std::unique_ptr<SSAGraph>& graph) {
if (GetStringFromEnv("XPUForceUseFP16", "false") == "true") {
xpu_use_fp16_optimizer_ = false;
VLOG(2) << "XPU force use data precision: FP16 ";
return;
}

if (graph->valid_places()[0].precision == PrecisionType::kFP16) {
xpu_use_fp16_optimizer_ = true;
VLOG(2) << "XPU auto use data precision: FP16/FP32/INT16 ";
Expand Down
19 changes: 16 additions & 3 deletions lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
if (kernel_pick_factors_.IsPrecisionConsidered() &&
(place.precision == kernel.precision() ||
kernel.precision() == PRECISION(kAny) ||
place.precision == PRECISION(kAny))) {
place.precision == PRECISION(kAny) ||
// fp16 may also pick FP32 kernel precision
(xpu_use_fp16_optimizer_ &&
kernel.precision() == PRECISION(kFloat) &&
place.precision == PRECISION(kFP16)))) {
// score skipped, if kernel is int8, but op is not int8
if (!(kernel.precision() == PRECISION(kInt8) &&
!instruct.op_info()->HasAttr("enable_int8"))) {
Expand Down Expand Up @@ -294,8 +298,9 @@ class XPUStaticKernelPickPass : public mir::StmtPass {

private:
core::KernelPickFactor kernel_pick_factors_;
#ifdef LITE_WITH_XPU

bool xpu_use_fp16_optimizer_{false};
#ifdef LITE_WITH_XPU
// TODO(quwei): add more ops
const std::set<std::string> PRECISION_INT31_OP_{"__xpu__fc"};
const std::set<std::string> PRECISION_INT8_OP_{"__xpu__fc"};
Expand All @@ -314,7 +319,15 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
"gather",
"pool2d",
"concat",
"calib"};
"calib",
"relu",
"tanh",
"sigmoid",
"leaky_relu",
"conv2d_transpose",
"elementwise_mul",
"elementwise_add",
"reduce_mean"};
const std::set<std::string> xpu_inplace_op_{"reshape",
"reshape2",
"flatten",
Expand Down
84 changes: 55 additions & 29 deletions lite/kernels/xpu/activation_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@ namespace lite {
namespace kernels {
namespace xpu {

void ReluCompute::Run() {
template <typename T, PrecisionType PType>
void ReluCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::relu(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}
Expand All @@ -54,24 +55,26 @@ void GeluCompute::Run() {
CHECK_EQ(r, 0);
}

void TanhCompute::Run() {
template <typename T, PrecisionType PType>
void TanhCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::tanh(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}

void SigmoidCompute::Run() {
template <typename T, PrecisionType PType>
void SigmoidCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::sigmoid(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}
Expand Down Expand Up @@ -205,13 +208,13 @@ void HardSigmoidCompute::Run() {
CHECK_EQ(r, 0);
}

void LeakyReluCompute::Run() {
template <typename T, PrecisionType PType>
void LeakyReluCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::leaky_relu(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel(),
param.Leaky_relu_alpha);
CHECK_EQ(r, 0);
Expand Down Expand Up @@ -274,12 +277,20 @@ void PReluCompute::Run() {
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(
relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def)
using reluFP32 =
paddle::lite::kernels::xpu::ReluCompute<float, PRECISION(kFloat)>;
using reluFP16 =
paddle::lite::kernels::xpu::ReluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(relu, kXPU, kFloat, kNCHW, reluFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(relu, kXPU, kFP16, kNCHW, reluFP16, reluFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
relu6, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Relu6Compute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
Expand All @@ -292,21 +303,31 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def)
using tanhFP32 =
paddle::lite::kernels::xpu::TanhCompute<float, PRECISION(kFloat)>;
using tanhFP16 =
paddle::lite::kernels::xpu::TanhCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(tanh, kXPU, kFloat, kNCHW, tanhFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(tanh, kXPU, kFP16, kNCHW, tanhFP16, tanhFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(sigmoid,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SigmoidCompute,
def)
using sigmoidFP32 =
paddle::lite::kernels::xpu::SigmoidCompute<float, PRECISION(kFloat)>;
using sigmoidFP16 =
paddle::lite::kernels::xpu::SigmoidCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(sigmoid, kXPU, kFloat, kNCHW, sigmoidFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid, kXPU, kFP16, kNCHW, sigmoidFP16, sigmoidFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def)
Expand Down Expand Up @@ -386,16 +407,21 @@ REGISTER_LITE_KERNEL(hard_swish,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(leaky_relu,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LeakyReluCompute,
def)
using leaky_reluFP32 =
paddle::lite::kernels::xpu::LeakyReluCompute<float, PRECISION(kFloat)>;
using leaky_reluFP16 =
paddle::lite::kernels::xpu::LeakyReluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(leaky_relu, kXPU, kFloat, kNCHW, leaky_reluFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
leaky_relu, kXPU, kFP16, kNCHW, leaky_reluFP16, leaky_reluFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(softsign,
kXPU,
kFloat,
Expand Down
12 changes: 8 additions & 4 deletions lite/kernels/xpu/activation_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ namespace lite {
namespace kernels {
namespace xpu {

class ReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class ReluCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand All @@ -47,7 +48,8 @@ class GeluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~GeluCompute() = default;
};

class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class TanhCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand All @@ -56,7 +58,8 @@ class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~TanhCompute() = default;
};

class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class SigmoidCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand Down Expand Up @@ -164,7 +167,8 @@ class HardSigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~HardSigmoidCompute() = default;
};

class LeakyReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class LeakyReluCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand Down
85 changes: 64 additions & 21 deletions lite/kernels/xpu/conv2d_transpose_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,23 @@ namespace lite {
namespace kernels {
namespace xpu {

template <>
void Conv2dTransposeCompute<PRECISION(kFloat)>::PrepareForRun() {
int cur_dev_idx = 0;

XPU_CALL(xpu_current_device(&cur_dev_idx));
XPU_CALL(xpu_device_get_attr(&cur_dev_attr_, XPUATTR_MODEL, cur_dev_idx));
if (cur_dev_attr_ <= 1) {
VLOG(4) << "Currents XPU device : XPU1";
} else if (cur_dev_attr_ >= 2 && cur_dev_attr_ <= 299) {
VLOG(4) << "Currents XPU device : XPU2";
} else if (cur_dev_attr_ >= 300 && cur_dev_attr_ <= 599) {
VLOG(4) << "Currents XPU device : XPU3";
} else {
VLOG(4) << "invaid XPU device";
}
}

template <>
void Conv2dTransposeCompute<PRECISION(kFloat)>::Run() {
auto& param = this->template Param<param_t>();
Expand All @@ -37,27 +54,53 @@ void Conv2dTransposeCompute<PRECISION(kFloat)>::Run() {
auto dilations = *param.dilations;

if (param.output_padding.empty()) {
int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
true);
CHECK_EQ(ret, 0);
if (cur_dev_attr_ <= 1) {
int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
true);
CHECK_EQ(ret, 0);
} else {
int ret = xdnn::conv2d_transpose_fusion<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
nullptr,
xdnn::Activation_t::LINEAR,
true);
CHECK_EQ(ret, 0);
}
} else {
int n = in_dims[0];
int yc = in_dims[1];
Expand Down
2 changes: 2 additions & 0 deletions lite/kernels/xpu/conv2d_transpose_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ class Conv2dTransposeCompute : public KernelLite<TARGET(kXPU), FilterPtype> {
public:
using param_t = operators::ConvParam;

void PrepareForRun() override;
void Run() override;

virtual ~Conv2dTransposeCompute() = default;
uint64_t cur_dev_attr_ = 0;
};

} // namespace xpu
Expand Down
Loading