From 5d2a4510c2c226c6921a8a213d04461f68ca7173 Mon Sep 17 00:00:00 2001 From: ciyong Date: Wed, 27 Mar 2019 20:03:49 +0800 Subject: [PATCH] Performance improving for MKL-DNN Quantized FullyConnected (#14528) * Cached bias to Quantized FullyConnected based on Subgraph to improve performance * retrigger CI * retrigger CI --- src/operator/nn/fully_connected-inl.h | 6 +++ .../mkldnn_quantized_fully_connected.cc | 21 ++++------ .../quantization/quantized_fully_connected.cc | 22 ++++++---- src/operator/subgraph/mkldnn/mkldnn_fc.cc | 41 +++++++++++-------- 4 files changed, 52 insertions(+), 38 deletions(-) diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index 93d384d51e6f..e4bb11f6bc56 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -48,6 +48,12 @@ enum FullyConnectedOpResource {kTempSpace}; enum FullyConnectedOpOutputs {kOut}; } // fullc +namespace quantized_fullc { +enum QuantizedFCInputMinMax {kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax}; +enum QuantizedFCOutputs {kOut, kOutMin, kOutMax}; +} // quantized_fullc + + struct FullyConnectedParam : public dmlc::Parameter { int num_hidden; bool no_bias; diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc index 39f8116379c2..71daf2ec2c16 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc @@ -31,11 +31,6 @@ namespace mxnet { namespace op { -namespace quantized_fc_enum { -enum QuantizedFCInputMinMax { kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax }; -enum QuantizedFCOutputs { kOut, kOutMin, kOutMax }; -} - void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &in_data, @@ -52,15 +47,15 @@ void MKLDNNQuantizedFullyConnectedForward(const 
nnvm::NodeAttrs &attrs, NDArray weight = in_data[fullc::kWeight]; const float min_data = - in_data[num_inputs + quantized_fc_enum::kDataMin].data().dptr()[0]; + in_data[num_inputs + quantized_fullc::kDataMin].data().dptr()[0]; const float max_data = - in_data[num_inputs + quantized_fc_enum::kDataMax].data().dptr()[0]; + in_data[num_inputs + quantized_fullc::kDataMax].data().dptr()[0]; const float min_weight = - in_data[num_inputs + quantized_fc_enum::kWeightMin].data().dptr()[0]; + in_data[num_inputs + quantized_fullc::kWeightMin].data().dptr()[0]; const float max_weight = - in_data[num_inputs + quantized_fc_enum::kWeightMax].data().dptr()[0]; - float *min_output_ptr = out_data[quantized_fc_enum::kOutMin].data().dptr(); - float *max_output_ptr = out_data[quantized_fc_enum::kOutMax].data().dptr(); + in_data[num_inputs + quantized_fullc::kWeightMax].data().dptr()[0]; + float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr(); + float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr(); auto data_range = (data.dtype() == mshadow::kInt8) ? 
kInt8Range : kUint8Range; float data_scale = data_range / MaxAbs(min_data, max_data); @@ -69,8 +64,8 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs, NDArray quantized_bias; if (!param.no_bias) { NDArray bias = in_data[fullc::kBias]; - float min_bias = in_data[num_inputs + quantized_fc_enum::kBiasMin].data().dptr()[0]; - float max_bias = in_data[num_inputs + quantized_fc_enum::kBiasMax].data().dptr()[0]; + float min_bias = in_data[num_inputs + quantized_fullc::kBiasMin].data().dptr()[0]; + float max_bias = in_data[num_inputs + quantized_fullc::kBiasMax].data().dptr()[0]; float bias_int32_rescale = data_scale * weight_scale * MaxAbs(min_bias, max_bias) / kInt8Range; quantized_bias = NDArray(bias.storage_type(), bias.shape(), diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc index 4718b3b673eb..0a04e71b9093 100644 --- a/src/operator/quantization/quantized_fully_connected.cc +++ b/src/operator/quantization/quantized_fully_connected.cc @@ -222,20 +222,26 @@ void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs, shiftdata.dptr_[i] = data_temp[i] + shift; } - Tensor min_output = out_data[1].get(s); - Tensor max_output = out_data[2].get(s); - Tensor min_data = in_data[num_inputs].get(s); - Tensor max_data = in_data[num_inputs + 1].get(s); - Tensor min_weight = in_data[num_inputs + 2].get(s); - Tensor max_weight = in_data[num_inputs + 3].get(s); + Tensor min_output = out_data[quantized_fullc::kOutMin].get(s); + Tensor max_output = out_data[quantized_fullc::kOutMax].get(s); + Tensor min_data = + in_data[num_inputs + quantized_fullc::kDataMin].get(s); + Tensor max_data = + in_data[num_inputs + quantized_fullc::kDataMax].get(s); + Tensor min_weight = + in_data[num_inputs + quantized_fullc::kWeightMin].get(s); + Tensor max_weight = + in_data[num_inputs + quantized_fullc::kWeightMax].get(s); Kernel::Launch(s, 1, min_output.dptr_, max_output.dptr_, min_data.dptr_, 
max_data.dptr_, min_weight.dptr_, max_weight.dptr_); if (!param.no_bias) { Tensor bias = in_data[fullc::kBias].get_with_shape( Shape1(wshape[0]), s); - Tensor min_bias = in_data[num_inputs + 4].get(s); - Tensor max_bias = in_data[num_inputs + 5].get(s); + Tensor min_bias = + in_data[num_inputs + quantized_fullc::kBiasMin].get(s); + Tensor max_bias = + in_data[num_inputs + quantized_fullc::kBiasMax].get(s); Kernel::Launch(s, n, out.dptr_, bias.dptr_, min_output.dptr_, max_output.dptr_, min_bias.dptr_, max_bias.dptr_); diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc index c9e1e1c79244..0ec05a2af087 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc @@ -63,7 +63,6 @@ class SgMKLDNNFCOp { nnvm::Symbol subgraph_sym_; MKLDNNFCFullParam full_param_; std::shared_ptr fwd_; - NDArray cached_weight_; NDArray cached_bias_; float cached_min_data_; float cached_max_data_; @@ -71,6 +70,8 @@ class SgMKLDNNFCOp { float cached_max_weight_; float cached_min_bias_; float cached_max_bias_; + float cached_min_output_; + float cached_max_output_; }; void SgMKLDNNFCOp::Forward(const OpContext &ctx, @@ -91,23 +92,19 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, float max_weight = 0.0; float min_bias = 0.0; float max_bias = 0.0; - float *min_output_ptr = nullptr; - float *max_output_ptr = nullptr; if (mkldnn_param.quantized) { total_num_inputs = base_num_inputs * 3; - min_data = in_data[base_num_inputs].data().dptr()[0]; - max_data = in_data[base_num_inputs + 1].data().dptr()[0]; - min_weight = in_data[base_num_inputs + 2].data().dptr()[0]; - max_weight = in_data[base_num_inputs + 3].data().dptr()[0]; + min_data = in_data[base_num_inputs + quantized_fullc::kDataMin].data().dptr()[0]; + max_data = in_data[base_num_inputs + quantized_fullc::kDataMax].data().dptr()[0]; + min_weight = in_data[base_num_inputs + quantized_fullc::kWeightMin].data().dptr()[0]; + max_weight = 
in_data[base_num_inputs + quantized_fullc::kWeightMax].data().dptr()[0]; if (has_bias) { - min_bias = in_data[base_num_inputs + 4].data().dptr()[0]; - max_bias = in_data[base_num_inputs + 5].data().dptr()[0]; + min_bias = in_data[base_num_inputs + quantized_fullc::kBiasMin].data().dptr()[0]; + max_bias = in_data[base_num_inputs + quantized_fullc::kBiasMax].data().dptr()[0]; } if (!mkldnn_param.enable_float_output) { total_num_outputs = base_num_outputs * 3; - min_output_ptr = out_data[1].data().dptr(); - max_output_ptr = out_data[2].data().dptr(); } } CHECK_EQ(in_data.size(), total_num_inputs); @@ -135,6 +132,8 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, cached_max_weight_ = max_weight; if (has_bias) { cached_bias_ = in_data[fullc::kBias]; + cached_min_bias_ = min_bias; + cached_max_bias_ = max_bias; } else { cached_bias_ = NDArray(); } @@ -149,7 +148,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, if (has_bias) { NDArray bias = in_data[fullc::kBias]; float bias_int32_rescale = data_scale * weight_scale * - MaxAbs(min_bias, max_bias) / kInt8Range; + MaxAbs(cached_min_bias_, cached_max_bias_) / kInt8Range; cached_bias_ = NDArray(bias.storage_type(), bias.shape(), bias.ctx(), true, mshadow::kInt32); @@ -168,15 +167,16 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, } else if (mkldnn_param.min_calib_range.has_value() && mkldnn_param.max_calib_range.has_value()) { full_param_.output_scales.resize(0); - *min_output_ptr = mkldnn_param.min_calib_range.value(); - *max_output_ptr = mkldnn_param.max_calib_range.value(); + cached_min_output_ = mkldnn_param.min_calib_range.value(); + cached_max_output_ = mkldnn_param.max_calib_range.value(); full_param_.requantize_scales[0] = quantized_out_range / - MaxAbs(*min_output_ptr, *max_output_ptr) / data_scale / weight_scale; + MaxAbs(cached_min_output_, cached_max_output_) / data_scale / weight_scale; } else { Stream *s = ctx.get_stream(); - mxnet_op::Kernel::Launch(s, 1, - min_output_ptr, max_output_ptr, &min_data, 
&max_data, &min_weight, &max_weight); + mxnet_op::Kernel::Launch( + s, 1, &cached_min_output_, &cached_max_output_, + &min_data, &max_data, &min_weight, &max_weight); } } @@ -195,6 +195,13 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, } MKLDNNFCForwardFullFeature(full_param_, ctx, fwd_.get(), new_inputs, new_req, out_data); + + if (mkldnn_param.quantized && !mkldnn_param.enable_float_output) { + float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr(); + float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr(); + *min_output_ptr = cached_min_output_; + *max_output_ptr = cached_max_output_; + } } static void SgMKLDNNFCParamParser(nnvm::NodeAttrs *attrs) {