Performance improving for MKL-DNN Quantized FullyConnected #14528
In the standalone quantized FullyConnected operator, the locally defined enum is removed in favor of the shared `quantized_fullc` indices:

```diff
@@ -31,11 +31,6 @@
 namespace mxnet {
 namespace op {
 
-namespace quantized_fc_enum {
-enum QuantizedFCInputMinMax { kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax };
-enum QuantizedFCOutputs { kOut, kOutMin, kOutMax };
-}
-
 void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
                                           const OpContext &ctx,
                                           const std::vector<NDArray> &in_data,
```
```diff
@@ -52,15 +47,15 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
   NDArray weight = in_data[fullc::kWeight];
 
   const float min_data =
-      in_data[num_inputs + quantized_fc_enum::kDataMin].data().dptr<float>()[0];
+      in_data[num_inputs + quantized_fullc::kDataMin].data().dptr<float>()[0];
   const float max_data =
-      in_data[num_inputs + quantized_fc_enum::kDataMax].data().dptr<float>()[0];
+      in_data[num_inputs + quantized_fullc::kDataMax].data().dptr<float>()[0];
   const float min_weight =
-      in_data[num_inputs + quantized_fc_enum::kWeightMin].data().dptr<float>()[0];
+      in_data[num_inputs + quantized_fullc::kWeightMin].data().dptr<float>()[0];
   const float max_weight =
-      in_data[num_inputs + quantized_fc_enum::kWeightMax].data().dptr<float>()[0];
-  float *min_output_ptr = out_data[quantized_fc_enum::kOutMin].data().dptr<float>();
-  float *max_output_ptr = out_data[quantized_fc_enum::kOutMax].data().dptr<float>();
+      in_data[num_inputs + quantized_fullc::kWeightMax].data().dptr<float>()[0];
+  float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr<float>();
+  float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr<float>();
 
   auto data_range = (data.dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range;
   float data_scale = data_range / MaxAbs(min_data, max_data);
```

Review discussion on this hunk:

> **Reviewer:** Quite strange usage. Why not define one whole input set together with the original inputs?
>
> **Author:** The original inputs might not include bias, which would give all these min/max entries different indices. Keeping them in a separate block simplifies the ordering for the quantized op only.
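To make the author's reply concrete, here is a minimal sketch of the input indexing convention described above. The enum values mirror the `quantized_fullc` ordering in the diff, but the helper function itself is illustrative, not MXNet API:

```cpp
#include <cstddef>

// Mirrors the quantized_fullc ordering used in the diff: the original FC
// inputs (data, weight, and optionally bias) come first, and the min/max
// scalars are appended as a contiguous block after them.
enum QuantizedFCInputMinMax { kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax };

// Illustrative helper: absolute position of a min/max scalar in in_data.
// Because the min/max block starts at num_inputs, its internal offsets stay
// the same whether or not bias is present.
std::size_t MinMaxIndex(bool no_bias, QuantizedFCInputMinMax which) {
  const std::size_t num_inputs = no_bias ? 2 : 3;  // data, weight[, bias]
  return num_inputs + static_cast<std::size_t>(which);
}
```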
```diff
@@ -69,8 +64,8 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
   NDArray quantized_bias;
   if (!param.no_bias) {
     NDArray bias = in_data[fullc::kBias];
-    float min_bias = in_data[num_inputs + quantized_fc_enum::kBiasMin].data().dptr<float>()[0];
-    float max_bias = in_data[num_inputs + quantized_fc_enum::kBiasMax].data().dptr<float>()[0];
+    float min_bias = in_data[num_inputs + quantized_fullc::kBiasMin].data().dptr<float>()[0];
+    float max_bias = in_data[num_inputs + quantized_fullc::kBiasMax].data().dptr<float>()[0];
     float bias_int32_rescale = data_scale * weight_scale * MaxAbs(min_bias, max_bias) / kInt8Range;
 
     quantized_bias = NDArray(bias.storage_type(), bias.shape(),
```
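For intuition, a self-contained sketch of the `bias_int32_rescale` arithmetic with assumed calibration ranges; the constants follow MXNet's quantization utilities (`kInt8Range` is 127, `kUint8Range` is 255), while the concrete ranges below are made up:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const float kInt8Range = 127.0f, kUint8Range = 255.0f;
  // Assumed calibration ranges for this example.
  const float min_data = -1.0f,   max_data = 1.0f;    // uint8 data
  const float min_weight = -0.5f, max_weight = 0.5f;  // int8 weights
  const float min_bias = -2.0f,   max_bias = 2.0f;    // int8 bias

  auto MaxAbs = [](float a, float b) { return std::max(std::fabs(a), std::fabs(b)); };

  const float data_scale   = kUint8Range / MaxAbs(min_data, max_data);     // 255
  const float weight_scale = kInt8Range / MaxAbs(min_weight, max_weight);  // 254

  // The int8 bias is rescaled so it matches the int32 accumulator scale
  // (data_scale * weight_scale), as in the hunk above.
  const float bias_int32_rescale =
      data_scale * weight_scale * MaxAbs(min_bias, max_bias) / kInt8Range;
  std::printf("bias_int32_rescale = %.1f\n", bias_int32_rescale);  // 1020.0
  return 0;
}
```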
In the MKL-DNN FC subgraph operator, the bias and output calibration ranges are now cached in class members instead of being written through raw output pointers on every forward. The member list after the change:

```diff
@@ -63,14 +63,15 @@ class SgMKLDNNFCOp {
   nnvm::Symbol subgraph_sym_;
   MKLDNNFCFullParam full_param_;
   std::shared_ptr<MKLDNNFullyConnectedForward> fwd_;
   NDArray cached_weight_;
   NDArray cached_bias_;
   float cached_min_data_;
   float cached_max_data_;
   float cached_min_weight_;
   float cached_max_weight_;
   float cached_min_bias_;
   float cached_max_bias_;
   float cached_min_output_;
   float cached_max_output_;
 };
 
 void SgMKLDNNFCOp::Forward(const OpContext &ctx,
```
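The point of these members is to compute the quantization parameters once and reuse them on subsequent forward calls. A minimal sketch of that pattern, with made-up names (not the actual MXNet class):

```cpp
// Illustrative caching pattern, not MXNet code.
class CachedQuantizedOp {
 public:
  void Forward(float min_data, float max_data) {
    if (!initialized_) {
      cached_min_data_ = min_data;
      cached_max_data_ = max_data;
      // ... compute scales, requantize parameters, and the MKL-DNN
      // primitive once; later calls reuse the cached state ...
      initialized_ = true;
    }
    // Fast path: no per-call recomputation of ranges or scales.
  }

 private:
  bool initialized_{false};
  float cached_min_data_{0.f};
  float cached_max_data_{0.f};
};
```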
In `Forward`, the hard-coded min/max indices are replaced with the shared enum names, and the raw output pointers disappear from this early section:

```diff
@@ -91,23 +92,19 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
   float max_weight = 0.0;
   float min_bias = 0.0;
   float max_bias = 0.0;
-  float *min_output_ptr = nullptr;
-  float *max_output_ptr = nullptr;
 
   if (mkldnn_param.quantized) {
     total_num_inputs = base_num_inputs * 3;
-    min_data = in_data[base_num_inputs].data().dptr<float>()[0];
-    max_data = in_data[base_num_inputs + 1].data().dptr<float>()[0];
-    min_weight = in_data[base_num_inputs + 2].data().dptr<float>()[0];
-    max_weight = in_data[base_num_inputs + 3].data().dptr<float>()[0];
+    min_data = in_data[base_num_inputs + quantized_fullc::kDataMin].data().dptr<float>()[0];
+    max_data = in_data[base_num_inputs + quantized_fullc::kDataMax].data().dptr<float>()[0];
+    min_weight = in_data[base_num_inputs + quantized_fullc::kWeightMin].data().dptr<float>()[0];
+    max_weight = in_data[base_num_inputs + quantized_fullc::kWeightMax].data().dptr<float>()[0];
     if (has_bias) {
-      min_bias = in_data[base_num_inputs + 4].data().dptr<float>()[0];
-      max_bias = in_data[base_num_inputs + 5].data().dptr<float>()[0];
+      min_bias = in_data[base_num_inputs + quantized_fullc::kBiasMin].data().dptr<float>()[0];
+      max_bias = in_data[base_num_inputs + quantized_fullc::kBiasMax].data().dptr<float>()[0];
     }
     if (!mkldnn_param.enable_float_output) {
       total_num_outputs = base_num_outputs * 3;
-      min_output_ptr = out_data[1].data().dptr<float>();
-      max_output_ptr = out_data[2].data().dptr<float>();
     }
   }
   CHECK_EQ(in_data.size(), total_num_inputs);
```
```diff
@@ -135,6 +132,8 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
     cached_max_weight_ = max_weight;
     if (has_bias) {
       cached_bias_ = in_data[fullc::kBias];
+      cached_min_bias_ = min_bias;
+      cached_max_bias_ = max_bias;
     } else {
       cached_bias_ = NDArray();
     }
```
```diff
@@ -149,7 +148,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
     if (has_bias) {
       NDArray bias = in_data[fullc::kBias];
       float bias_int32_rescale = data_scale * weight_scale *
-          MaxAbs(min_bias, max_bias) / kInt8Range;
+          MaxAbs(cached_min_bias_, cached_max_bias_) / kInt8Range;
 
       cached_bias_ = NDArray(bias.storage_type(), bias.shape(),
                              bias.ctx(), true, mshadow::kInt32);
```
```diff
@@ -168,15 +167,16 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
     } else if (mkldnn_param.min_calib_range.has_value() &&
                mkldnn_param.max_calib_range.has_value()) {
       full_param_.output_scales.resize(0);
-      *min_output_ptr = mkldnn_param.min_calib_range.value();
-      *max_output_ptr = mkldnn_param.max_calib_range.value();
+      cached_min_output_ = mkldnn_param.min_calib_range.value();
+      cached_max_output_ = mkldnn_param.max_calib_range.value();
 
       full_param_.requantize_scales[0] = quantized_out_range /
-          MaxAbs(*min_output_ptr, *max_output_ptr) / data_scale / weight_scale;
+          MaxAbs(cached_min_output_, cached_max_output_) / data_scale / weight_scale;
     } else {
       Stream<cpu> *s = ctx.get_stream<cpu>();
-      mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
-          min_output_ptr, max_output_ptr, &min_data, &max_data, &min_weight, &max_weight);
+      mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(
+          s, 1, &cached_min_output_, &cached_max_output_,
+          &min_data, &max_data, &min_weight, &max_weight);
     }
   }
```
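As a rough illustration of the calibrated branch: the requantize scale maps the int32 accumulator (whose scale is `data_scale * weight_scale`) onto the calibrated output range. All numbers below are assumed, not taken from the PR:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const float kInt8Range = 127.0f, kUint8Range = 255.0f;
  const float quantized_out_range = kInt8Range;      // int8 output assumed
  const float min_calib = -4.0f, max_calib = 4.0f;   // assumed calib_range
  const float data_scale   = kUint8Range / 1.0f;     // MaxAbs(data) == 1.0
  const float weight_scale = kInt8Range / 0.5f;      // MaxAbs(weight) == 0.5

  // Same shape as full_param_.requantize_scales[0] in the hunk above.
  const float requantize_scale = quantized_out_range /
      std::max(std::fabs(min_calib), std::fabs(max_calib)) /
      data_scale / weight_scale;
  std::printf("requantize scale = %g\n", requantize_scale);  // ~0.00049
  return 0;
}
```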
```diff
@@ -195,6 +195,13 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
   }
 
   MKLDNNFCForwardFullFeature(full_param_, ctx, fwd_.get(), new_inputs, new_req, out_data);
+
+  if (mkldnn_param.quantized && !mkldnn_param.enable_float_output) {
+    float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr<float>();
+    float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr<float>();
+    *min_output_ptr = cached_min_output_;
+    *max_output_ptr = cached_max_output_;
+  }
 }
 
 static void SgMKLDNNFCParamParser(nnvm::NodeAttrs *attrs) {
```

Review discussion on this hunk:

> **Reviewer:** Add a comment on why.
>
> **Author:** I think it's straightforward here: `kOutMin` and `kOutMax` are only valid when the op is quantized and not generating fp32 output.
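The guard reflects the operator's output layout: `out_min`/`out_max` exist only when the op stays quantized end to end. A sketch of the rule (illustrative helper, not MXNet API):

```cpp
// out_data is [out] for fp32 output, or [out, out_min, out_max] when the
// op produces a quantized output whose range must be reported downstream.
int NumOutputs(bool quantized, bool enable_float_output) {
  return (quantized && !enable_float_output) ? 3 : 1;
}
```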
General review discussion:

> **Reviewer:** `quantized_fc`?
>
> **Author:** Just to align with `fullc`. :)