[MKLDNN] Support channel wise quantization for FullyConnected (#17187)
* Add channel wise quantization option for fullyconnected

* fix lint

* retrigger CI

* retrigger CI

* Add channel-wise option support for more quantization user-level API

* Only update quantize_net_v2 API and keep quantize_net API unchanged

* retrigger CI

* fix pylint

* Add check for quantize_granularity option
ciyongch authored and TaoLv committed Jan 3, 2020
1 parent 2a9ec0e commit 89fe1f6
Showing 13 changed files with 536 additions and 273 deletions.
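
For reference, a minimal usage sketch of the new option through the updated `quantize_net_v2` API; the ResNet model and random calibration data below are illustrative placeholders, not part of this commit:

```python
# Hedged usage sketch of the quantize_granularity option added in this commit.
# The model and calibration data are placeholders for illustration only.
import mxnet as mx
from mxnet.contrib.quantization import quantize_net_v2

net = mx.gluon.model_zoo.vision.resnet18_v1(pretrained=True, ctx=mx.cpu())
calib_data = mx.gluon.data.DataLoader(
    mx.gluon.data.ArrayDataset(mx.nd.random.uniform(shape=(32, 3, 224, 224))),
    batch_size=8)

# 'channel-wise' enables per-output-channel scales for FullyConnected weights
# on the MKL-DNN backend; the default 'tensor-wise' keeps a single scale.
qnet = quantize_net_v2(net,
                       quantized_dtype='auto',
                       quantize_mode='full',
                       quantize_granularity='channel-wise',
                       calib_data=calib_data,
                       calib_mode='naive',
                       ctx=mx.cpu())
```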
5 changes: 3 additions & 2 deletions include/mxnet/c_api.h
@@ -1932,6 +1932,7 @@ MXNET_DLL int MXSymbolInferTypePartial(SymbolHandle sym,
* \param quantized_dtype the quantized destination type for input data
* \param calib_quantize **Deprecated**. quantize op will always be calibrated if could
* \param quantize_mode quantize mode to be used in quantize pass
* \param quantize_granularity quantize granularity, tensor-wise or channel-wise
* \param out_num_calib_names return the number of nodes to be calibrated
* \param out_calib_names return the node names to be calibrated
*/
@@ -1944,8 +1945,8 @@ MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle,
const char **excluded_op_names,
const uint32_t num_offline, const char **offline_params,
const char *quantized_dtype, const bool calib_quantize,
const char *quantize_mode, uint32_t* out_num_calib_names,
const char ***out_calib_names);
const char *quantize_mode, const char *quantize_granularity,
uint32_t* out_num_calib_names, const char ***out_calib_names);

/*!
* \brief Convert a symbol into a mixed precision symbol with cast operators for target dtype casting
4 changes: 3 additions & 1 deletion include/mxnet/op_attr_types.h
@@ -31,6 +31,7 @@

#include <vector>
#include <functional>
#include <string>

#include "./base.h"
#include "./ndarray.h"
@@ -344,7 +345,8 @@ using FNeedRequantize = std::function<bool (const NodeAttrs& attrs)>;
* which can handle fp32 inputs directly.
*/
using FAvoidQuantizeInput = std::function<bool (const NodeAttrs& attrs,
size_t index)>;
const size_t index,
const std::string quantize_granularity)>;

/*!
* \brief Register a function to determine if the input of a quantized operator
52 changes: 40 additions & 12 deletions python/mxnet/contrib/quantization.py
@@ -85,7 +85,8 @@ def _quantize_params(qsym, params, th_dict):
return quantized_params

def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
offline_params=None, quantized_dtype='int8', quantize_mode='smart'):
offline_params=None, quantized_dtype='int8', quantize_mode='smart',
quantize_granularity='tensor-wise'):
"""Given a symbol object representing a neural network of data type FP32,
quantize it into a INT8 network.
@@ -109,6 +110,9 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
The quantized destination type for input data.
quantize_mode: str
The mode that quantization pass to apply.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
"""
num_excluded_symbols = 0
@@ -147,6 +151,7 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
c_str(quantized_dtype),
ctypes.c_bool(True),
c_str(quantize_mode),
c_str(quantize_granularity),
ctypes.byref(size),
ctypes.byref(calib_str)))
calib_layer = []
@@ -459,7 +464,8 @@ def quantize_model(sym, arg_params, aux_params,
data_names=('data',), label_names=('softmax_label',),
ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy',
calib_data=None, num_calib_examples=None,
quantized_dtype='int8', quantize_mode='smart', logger=None):
quantized_dtype='int8', quantize_mode='smart',
quantize_granularity='tensor-wise', logger=None):
"""User-level API for generating a quantized model from a FP32 model w/ or w/o calibration.
The backend quantized operators are only enabled for Linux systems. Please do not run
inference using the quantized models on Windows for now.
@@ -515,6 +521,9 @@ def quantize_model(sym, arg_params, aux_params,
The mode that quantization pass to apply. Support 'full' and 'smart'.
'full' means quantize all operator if possible.
'smart' means quantization pass will smartly choice which operator should be quantized.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
logger : Object
A logging object for printing information during the process of quantization.
@@ -544,11 +553,15 @@ def quantize_model(sym, arg_params, aux_params,
if quantized_dtype not in ('int8', 'uint8', 'auto'):
raise ValueError('unknown quantized_dtype %s received,'
' expected `int8`, `uint8` or `auto`' % quantized_dtype)
if quantize_granularity not in ('tensor-wise', 'channel-wise'):
raise ValueError('unknown quantize_granularity %s received,'
' expected `tensor-wise` or `channel-wise`.' % quantize_granularity)
qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
excluded_operators=excluded_op_names,
offline_params=list(
arg_params.keys()),
quantized_dtype=quantized_dtype, quantize_mode=quantize_mode)
offline_params=list(arg_params.keys()),
quantized_dtype=quantized_dtype,
quantize_mode=quantize_mode,
quantize_granularity=quantize_granularity)
th_dict = {}
if calib_mode is not None and calib_mode != 'none':
if not isinstance(ctx, Context):
@@ -597,7 +610,8 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
data_names=('data',), label_names=('softmax_label',),
ctx=cpu(), excluded_sym_names=None, excluded_op_names=None,
calib_mode='entropy', calib_data=None, num_calib_examples=None,
quantized_dtype='int8', quantize_mode='smart', logger=None):
quantized_dtype='int8', quantize_mode='smart',
quantize_granularity='tensor-wise', logger=None):
"""User-level API for generating a fusion + quantized model from a FP32 model
w/ or w/o calibration with Intel MKL-DNN.
The backend quantized operators are only enabled for Linux systems. Please do not run
@@ -628,15 +642,16 @@
calib_mode=calib_mode, calib_data=calib_data,
num_calib_examples=num_calib_examples,
quantized_dtype=quantized_dtype, quantize_mode=quantize_mode,
logger=logger)
quantize_granularity=quantize_granularity, logger=logger)

qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')

return qsym, qarg_params, aux_params

def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
excluded_sym_names=None, excluded_op_names=None,
calib_mode='entropy', quantized_dtype='int8', quantize_mode='full',
calib_mode='entropy', quantized_dtype='int8',
quantize_mode='full', quantize_granularity='tensor-wise',
LayerOutputCollector=None, logger=None):
"""User-level API for generating a quantized model from a FP32 model w/o calibration
and a collector for naive or entropy calibration.
@@ -676,6 +691,9 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
The mode that quantization pass to apply. Support 'full' and 'smart'.
'full' means quantize all operator if possible.
'smart' means quantization pass will smartly choice which operator should be quantized.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
LayerOutputCollector : class
For customize calibration method usage.
logger : Object
@@ -700,12 +718,16 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
if quantized_dtype not in ('int8', 'uint8', 'auto'):
raise ValueError('unknown quantized_dtype %s received,'
' expected `int8`, `uint8` or `auto`' % quantized_dtype)
if quantize_granularity not in ('tensor-wise', 'channel-wise'):
raise ValueError('unknown quantize_granularity %s received,'
' expected `tensor-wise` or `channel-wise`.' % quantize_granularity)
qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
excluded_operators=excluded_op_names,
offline_params=list(
arg_params.keys()),
quantized_dtype=quantized_dtype,
quantize_mode=quantize_mode)
quantize_mode=quantize_mode,
quantize_granularity=quantize_granularity)

th_dict = {}
collector = None
@@ -801,7 +823,7 @@ def calib_graph(qsym, arg_params, aux_params, collector,

return qsym, qarg_params, aux_params

def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full',
def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full', quantize_granularity='tensor-wise',
exclude_layers=None, exclude_layers_match=None, exclude_operators=None,
calib_data=None, data_shapes=None, calib_mode='none',
num_calib_examples=None, ctx=cpu(), LayerOutputCollector=None, logger=None):
@@ -821,6 +843,9 @@ def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full',
The mode that quantization pass to apply. Support 'full' and 'smart'.
'full' means quantize all operator if possible.
'smart' means quantization pass will smartly choice which operator should be quantized.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
exclude_layers : list of strings
A list of strings representing the names of the symbols that users want to excluding
exclude_layers_match : list of strings
@@ -927,7 +952,8 @@ def __exit__(self, exc_type, exc_value, traceback):
sym=symnet, arg_params=args, aux_params=auxs, ctx=ctx,
excluded_sym_names=exclude_layers, excluded_op_names=exclude_operators,
calib_mode=calib_mode, quantized_dtype=quantized_dtype, quantize_mode=quantize_mode,
LayerOutputCollector=LayerOutputCollector, logger=logger)
quantize_granularity=quantize_granularity, LayerOutputCollector=LayerOutputCollector,
logger=logger)

if calib_mode is not None and calib_mode != 'none':
if not isinstance(ctx, Context):
@@ -987,7 +1013,9 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full',
"""
warnings.warn('WARNING: This will be deprecated after MXNet 2.0, please use quantize_net_v2.')
return quantize_net_v2(network=network, quantized_dtype=quantized_dtype,
quantize_mode=quantize_mode, exclude_layers=exclude_layers,
quantize_mode=quantize_mode,
quantize_granularity='tensor-wise',
exclude_layers=exclude_layers,
exclude_layers_match=exclude_layers_match,
exclude_operators=exclude_operators,
calib_data=calib_data, data_shapes=data_shapes,
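
The symbolic user-level APIs (`quantize_model`, `quantize_model_mkldnn`, `quantize_graph`) accept the same keyword. A hedged sketch of the `quantize_model` path; the checkpoint prefix and random calibration iterator below are placeholders, not part of this commit:

```python
# Sketch of the symbolic-API path with the new keyword. The 'model' checkpoint
# prefix and the random calibration iterator are placeholders for illustration.
import mxnet as mx
from mxnet.contrib.quantization import quantize_model

sym, arg_params, aux_params = mx.model.load_checkpoint('model', 0)
calib_iter = mx.io.NDArrayIter(data=mx.nd.random.uniform(shape=(64, 3, 224, 224)),
                               batch_size=8)

qsym, qarg_params, aux_params = quantize_model(
    sym, arg_params, aux_params,
    ctx=mx.cpu(),
    calib_mode='naive',
    calib_data=calib_iter,
    num_calib_examples=64,
    quantized_dtype='auto',
    quantize_mode='smart',
    quantize_granularity='channel-wise')
```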
3 changes: 3 additions & 0 deletions src/c_api/c_api_symbolic.cc
@@ -925,6 +925,7 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
const char *quantized_dtype,
const bool calib_quantize,
const char *quantize_mode,
const char *quantize_granularity,
mx_uint* out_num_calib_names,
const char ***out_calib_names) {
nnvm::Symbol *s = new nnvm::Symbol();
@@ -946,12 +947,14 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
}
std::string quantized_type(quantized_dtype);
std::string quantized_mode(quantize_mode);
std::string quantized_granularity(quantize_granularity);
g.attrs["excluded_nodes"] = std::make_shared<nnvm::any>(std::move(excluded_node_names));
g.attrs["excluded_ops"] = std::make_shared<nnvm::any>(std::move(excluded_op));
g.attrs["offline_params"] = std::make_shared<nnvm::any>(std::move(offline));
g.attrs["quantized_dtype"] = std::make_shared<nnvm::any>(std::move(quantized_type));
g.attrs["target_ctx"] = std::make_shared<nnvm::any>(target_dev);
g.attrs["quantize_mode"] = std::make_shared<nnvm::any>(std::move(quantized_mode));
g.attrs["quantize_granularity"] = std::make_shared<nnvm::any>(std::move(quantized_granularity));
g = ApplyPass(std::move(g), "QuantizeGraph");
const auto& calib_nodes = g.GetAttr<std::vector<std::string>>("calib_nodes");
MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
7 changes: 5 additions & 2 deletions src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
@@ -43,6 +43,7 @@ struct MKLDNNFCParam: public dmlc::Parameter<MKLDNNFCParam> {
bool with_eltwise;
dmlc::optional<float> min_calib_range; // min float value calculated from calibration dataset
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
dmlc::optional<bool> channel_wise_quantize;

DMLC_DECLARE_PARAMETER(MKLDNNFCParam) {
DMLC_DECLARE_FIELD(quantized).set_default(false)
@@ -61,15 +62,17 @@ struct MKLDNNFCParam: public dmlc::Parameter<MKLDNNFCParam> {
.describe("The maximum scalar value in the form of float32 obtained "
"through calibration. If present, it will be used to by "
"quantized fullyconnected op to calculate primitive scale");
DMLC_DECLARE_FIELD(channel_wise_quantize)
.set_default(dmlc::optional<bool>())
.describe("Whether to support channel-wise quantization for weight.");
}
};

struct MKLDNNFCFullParam {
FullyConnectedParam default_param;
MKLDNNFCParam mkldnn_param;
MKLDNNPostEltwiseParam eltwise_param;
std::vector<float> output_scales = {0.0};
std::vector<float> requantize_scales = {0.0};
std::vector<float> output_scales = {0.0f};
};

mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(
25 changes: 7 additions & 18 deletions src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -37,7 +37,8 @@ mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(
const NDArray &data, const NDArray &weight, const NDArray *bias,
const mkldnn::memory::desc &out_md) {
auto data_md = GetMemDesc(data);
auto weight_md = GetFCWeightDesc(weight);
auto weight_md = full_param.mkldnn_param.quantized ?
GetFCWeightDesc(weight, mshadow::kInt8) : GetFCWeightDesc(weight);
auto engine = CpuEngine::Get()->get_engine();
auto propagation =
is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
@@ -52,22 +53,9 @@
}
attr.set_post_ops(ops);

if (full_param.mkldnn_param.quantized) {
if ((full_param.mkldnn_param.min_calib_range.has_value() &&
full_param.mkldnn_param.max_calib_range.has_value()) ||
full_param.mkldnn_param.enable_float_output) {
int mask = 0;
std::vector<float> scales = {0.0};
if (full_param.requantize_scales.size()) {
scales[0] = full_param.requantize_scales[0];
} else if (full_param.output_scales.size()) {
scales[0] = full_param.output_scales[0];
} else {
LOG(FATAL) << "Must specified either output_scales or requantize_scales!";
}

attr.set_output_scales(mask, scales);
}
if (full_param.mkldnn_param.quantized && full_param.output_scales.size()) {
int mask = (full_param.output_scales.size() == 1) ? 0 : (1 << 1);
attr.set_output_scales(mask, full_param.output_scales);
}

auto GetFCFwdPd = [&full_param, &attr,
@@ -88,7 +76,8 @@ mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(
if (bias) {
if ((*bias).shape().ndim() != 1)
LOG(FATAL) << "Unexpected shape for bias " << (*bias).shape();
auto bias_md = GetMemDesc(*bias);
auto bias_md =
full_param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias);
mkldnn::inner_product_forward::desc desc(propagation,
data_md, weight_md, bias_md, out_md);
return GetFCFwdPd(desc);
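
Conceptually, channel-wise quantization replaces the single weight scale with one scale per output channel (one per row of the FullyConnected weight matrix), which is what the per-channel output-scales mask above enables. A small NumPy illustration of that idea, not code from this commit:

```python
# Illustration only (not from this commit): int8 scales for an FC weight matrix
# of shape (num_hidden, input_dim), tensor-wise vs channel-wise.
import numpy as np

def fc_weight_scales(weight, channel_wise=True):
    int8_range = 127.0
    if channel_wise:
        # one scale per output channel, i.e. per row of the weight matrix
        absmax = np.abs(weight).max(axis=1)
    else:
        # tensor-wise: a single scale for the whole tensor
        absmax = np.abs(weight).max()
    return int8_range / np.maximum(absmax, 1e-30)

w = np.random.randn(8, 16).astype(np.float32)
print(fc_weight_scales(w).shape)         # (8,)  one scale per output channel
print(fc_weight_scales(w, False).shape)  # ()    single tensor-wise scale
```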