[MKLDNN] Support channel wise quantization for FullyConnected (#17187)
* Add channel wise quantization option for fullyconnected

* fix lint

* retrigger CI

* retrigger CI

* Add channel-wise option support for more quantization user-level API

* Only update quantize_net_v2 API and keep quantize_net API unchanged

* retrigger CI

* fix pylint

* Add check for quantize_granularity option
ciyongch authored and TaoLv committed Jan 3, 2020
1 parent 2a9ec0e commit 89fe1f6
Showing 13 changed files with 536 additions and 273 deletions.
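
For reference, a minimal usage sketch of the new option through the updated `quantize_net_v2` API; the ResNet model and random calibration data below are illustrative placeholders, not part of this commit:

```python
# Hedged usage sketch of the quantize_granularity option added in this commit.
# The model and calibration data are placeholders for illustration only.
import mxnet as mx
from mxnet.contrib.quantization import quantize_net_v2

net = mx.gluon.model_zoo.vision.resnet18_v1(pretrained=True, ctx=mx.cpu())
calib_data = mx.gluon.data.DataLoader(
    mx.gluon.data.ArrayDataset(mx.nd.random.uniform(shape=(32, 3, 224, 224))),
    batch_size=8)

# 'channel-wise' enables per-output-channel scales for FullyConnected weights
# on the MKL-DNN backend; the default 'tensor-wise' keeps a single scale.
qnet = quantize_net_v2(net,
                       quantized_dtype='auto',
                       quantize_mode='full',
                       quantize_granularity='channel-wise',
                       calib_data=calib_data,
                       calib_mode='naive',
                       ctx=mx.cpu())
```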
5 changes: 3 additions & 2 deletions include/mxnet/c_api.h
@@ -1932,6 +1932,7 @@ MXNET_DLL int MXSymbolInferTypePartial(SymbolHandle sym,
* \param quantized_dtype the quantized destination type for input data
* \param calib_quantize **Deprecated**. quantize op will always be calibrated if could
* \param quantize_mode quantize mode to be used in quantize pass
* \param quantize_granularity quantize granularity, tensor-wise or channel-wise
* \param out_num_calib_names return the number of nodes to be calibrated
* \param out_calib_names return the node names to be calibrated
*/
@@ -1944,8 +1945,8 @@ MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle,
const char **excluded_op_names,
const uint32_t num_offline, const char **offline_params,
const char *quantized_dtype, const bool calib_quantize,
const char *quantize_mode, uint32_t* out_num_calib_names,
const char ***out_calib_names);
const char *quantize_mode, const char *quantize_granularity,
uint32_t* out_num_calib_names, const char ***out_calib_names);

/*!
* \brief Convert a symbol into a mixed precision symbol with cast operators for target dtype casting
4 changes: 3 additions & 1 deletion include/mxnet/op_attr_types.h
@@ -31,6 +31,7 @@

#include <vector>
#include <functional>
#include <string>

#include "./base.h"
#include "./ndarray.h"
@@ -344,7 +345,8 @@ using FNeedRequantize = std::function<bool (const NodeAttrs& attrs)>;
* which can handle fp32 inputs directly.
*/
using FAvoidQuantizeInput = std::function<bool (const NodeAttrs& attrs,
size_t index)>;
const size_t index,
const std::string quantize_granularity)>;

/*!
* \brief Register a function to determine if the input of a quantized operator
52 changes: 40 additions & 12 deletions python/mxnet/contrib/quantization.py
@@ -85,7 +85,8 @@ def _quantize_params(qsym, params, th_dict):
return quantized_params

def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
offline_params=None, quantized_dtype='int8', quantize_mode='smart'):
offline_params=None, quantized_dtype='int8', quantize_mode='smart',
quantize_granularity='tensor-wise'):
"""Given a symbol object representing a neural network of data type FP32,
quantize it into a INT8 network.
@@ -109,6 +110,9 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
The quantized destination type for input data.
quantize_mode: str
The mode that quantization pass to apply.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
"""
num_excluded_symbols = 0
@@ -147,6 +151,7 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
c_str(quantized_dtype),
ctypes.c_bool(True),
c_str(quantize_mode),
c_str(quantize_granularity),
ctypes.byref(size),
ctypes.byref(calib_str)))
calib_layer = []
@@ -459,7 +464,8 @@ def quantize_model(sym, arg_params, aux_params,
data_names=('data',), label_names=('softmax_label',),
ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy',
calib_data=None, num_calib_examples=None,
quantized_dtype='int8', quantize_mode='smart', logger=None):
quantized_dtype='int8', quantize_mode='smart',
quantize_granularity='tensor-wise', logger=None):
"""User-level API for generating a quantized model from a FP32 model w/ or w/o calibration.
The backend quantized operators are only enabled for Linux systems. Please do not run
inference using the quantized models on Windows for now.
@@ -515,6 +521,9 @@ def quantize_model(sym, arg_params, aux_params,
The mode that quantization pass to apply. Support 'full' and 'smart'.
'full' means quantize all operator if possible.
'smart' means quantization pass will smartly choice which operator should be quantized.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
logger : Object
A logging object for printing information during the process of quantization.
@@ -544,11 +553,15 @@ def quantize_model(sym, arg_params, aux_params,
if quantized_dtype not in ('int8', 'uint8', 'auto'):
raise ValueError('unknown quantized_dtype %s received,'
' expected `int8`, `uint8` or `auto`' % quantized_dtype)
if quantize_granularity not in ('tensor-wise', 'channel-wise'):
raise ValueError('unknown quantize_granularity %s received,'
' expected `tensor-wise` or `channel-wise`.' % quantize_granularity)
qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
excluded_operators=excluded_op_names,
offline_params=list(
arg_params.keys()),
quantized_dtype=quantized_dtype, quantize_mode=quantize_mode)
offline_params=list(arg_params.keys()),
quantized_dtype=quantized_dtype,
quantize_mode=quantize_mode,
quantize_granularity=quantize_granularity)
th_dict = {}
if calib_mode is not None and calib_mode != 'none':
if not isinstance(ctx, Context):
@@ -597,7 +610,8 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
data_names=('data',), label_names=('softmax_label',),
ctx=cpu(), excluded_sym_names=None, excluded_op_names=None,
calib_mode='entropy', calib_data=None, num_calib_examples=None,
quantized_dtype='int8', quantize_mode='smart', logger=None):
quantized_dtype='int8', quantize_mode='smart',
quantize_granularity='tensor-wise', logger=None):
"""User-level API for generating a fusion + quantized model from a FP32 model
w/ or w/o calibration with Intel MKL-DNN.
The backend quantized operators are only enabled for Linux systems. Please do not run
@@ -628,15 +642,16 @@
calib_mode=calib_mode, calib_data=calib_data,
num_calib_examples=num_calib_examples,
quantized_dtype=quantized_dtype, quantize_mode=quantize_mode,
logger=logger)
quantize_granularity=quantize_granularity, logger=logger)

qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')

return qsym, qarg_params, aux_params

def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
excluded_sym_names=None, excluded_op_names=None,
calib_mode='entropy', quantized_dtype='int8', quantize_mode='full',
calib_mode='entropy', quantized_dtype='int8',
quantize_mode='full', quantize_granularity='tensor-wise',
LayerOutputCollector=None, logger=None):
"""User-level API for generating a quantized model from a FP32 model w/o calibration
and a collector for naive or entropy calibration.
@@ -676,6 +691,9 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
The mode that quantization pass to apply. Support 'full' and 'smart'.
'full' means quantize all operator if possible.
'smart' means quantization pass will smartly choice which operator should be quantized.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
LayerOutputCollector : class
For customize calibration method usage.
logger : Object
@@ -700,12 +718,16 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
if quantized_dtype not in ('int8', 'uint8', 'auto'):
raise ValueError('unknown quantized_dtype %s received,'
' expected `int8`, `uint8` or `auto`' % quantized_dtype)
if quantize_granularity not in ('tensor-wise', 'channel-wise'):
raise ValueError('unknown quantize_granularity %s received,'
' expected `tensor-wise` or `channel-wise`.' % quantize_granularity)
qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
excluded_operators=excluded_op_names,
offline_params=list(
arg_params.keys()),
quantized_dtype=quantized_dtype,
quantize_mode=quantize_mode)
quantize_mode=quantize_mode,
quantize_granularity=quantize_granularity)

th_dict = {}
collector = None
@@ -801,7 +823,7 @@ def calib_graph(qsym, arg_params, aux_params, collector,

return qsym, qarg_params, aux_params

def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full',
def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full', quantize_granularity='tensor-wise',
exclude_layers=None, exclude_layers_match=None, exclude_operators=None,
calib_data=None, data_shapes=None, calib_mode='none',
num_calib_examples=None, ctx=cpu(), LayerOutputCollector=None, logger=None):
@@ -821,6 +843,9 @@ def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full',
The mode that quantization pass to apply. Support 'full' and 'smart'.
'full' means quantize all operator if possible.
'smart' means quantization pass will smartly choice which operator should be quantized.
quantize_granularity: str
The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise'
quantization. The default value is 'tensor-wise'.
exclude_layers : list of strings
A list of strings representing the names of the symbols that users want to excluding
exclude_layers_match : list of strings
@@ -927,7 +952,8 @@ def __exit__(self, exc_type, exc_value, traceback):
sym=symnet, arg_params=args, aux_params=auxs, ctx=ctx,
excluded_sym_names=exclude_layers, excluded_op_names=exclude_operators,
calib_mode=calib_mode, quantized_dtype=quantized_dtype, quantize_mode=quantize_mode,
LayerOutputCollector=LayerOutputCollector, logger=logger)
quantize_granularity=quantize_granularity, LayerOutputCollector=LayerOutputCollector,
logger=logger)

if calib_mode is not None and calib_mode != 'none':
if not isinstance(ctx, Context):
@@ -987,7 +1013,9 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full',
"""
warnings.warn('WARNING: This will be deprecated after MXNet 2.0, please use quantize_net_v2.')
return quantize_net_v2(network=network, quantized_dtype=quantized_dtype,
quantize_mode=quantize_mode, exclude_layers=exclude_layers,
quantize_mode=quantize_mode,
quantize_granularity='tensor-wise',
exclude_layers=exclude_layers,
exclude_layers_match=exclude_layers_match,
exclude_operators=exclude_operators,
calib_data=calib_data, data_shapes=data_shapes,
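
The symbolic user-level APIs (`quantize_model`, `quantize_model_mkldnn`, `quantize_graph`) accept the same keyword. A hedged sketch of the `quantize_model` path; the checkpoint prefix and random calibration iterator below are placeholders, not part of this commit:

```python
# Sketch of the symbolic-API path with the new keyword. The 'model' checkpoint
# prefix and the random calibration iterator are placeholders for illustration.
import mxnet as mx
from mxnet.contrib.quantization import quantize_model

sym, arg_params, aux_params = mx.model.load_checkpoint('model', 0)
calib_iter = mx.io.NDArrayIter(data=mx.nd.random.uniform(shape=(64, 3, 224, 224)),
                               batch_size=8)

qsym, qarg_params, aux_params = quantize_model(
    sym, arg_params, aux_params,
    ctx=mx.cpu(),
    calib_mode='naive',
    calib_data=calib_iter,
    num_calib_examples=64,
    quantized_dtype='auto',
    quantize_mode='smart',
    quantize_granularity='channel-wise')
```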
3 changes: 3 additions & 0 deletions src/c_api/c_api_symbolic.cc
@@ -925,6 +925,7 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
const char *quantized_dtype,
const bool calib_quantize,
const char *quantize_mode,
const char *quantize_granularity,
mx_uint* out_num_calib_names,
const char ***out_calib_names) {
nnvm::Symbol *s = new nnvm::Symbol();
@@ -946,12 +947,14 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
}
std::string quantized_type(quantized_dtype);
std::string quantized_mode(quantize_mode);
std::string quantized_granularity(quantize_granularity);
g.attrs["excluded_nodes"] = std::make_shared<nnvm::any>(std::move(excluded_node_names));
g.attrs["excluded_ops"] = std::make_shared<nnvm::any>(std::move(excluded_op));
g.attrs["offline_params"] = std::make_shared<nnvm::any>(std::move(offline));
g.attrs["quantized_dtype"] = std::make_shared<nnvm::any>(std::move(quantized_type));
g.attrs["target_ctx"] = std::make_shared<nnvm::any>(target_dev);
g.attrs["quantize_mode"] = std::make_shared<nnvm::any>(std::move(quantized_mode));
g.attrs["quantize_granularity"] = std::make_shared<nnvm::any>(std::move(quantized_granularity));
g = ApplyPass(std::move(g), "QuantizeGraph");
const auto& calib_nodes = g.GetAttr<std::vector<std::string>>("calib_nodes");
MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
7 changes: 5 additions & 2 deletions src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
@@ -43,6 +43,7 @@ struct MKLDNNFCParam: public dmlc::Parameter<MKLDNNFCParam> {
bool with_eltwise;
dmlc::optional<float> min_calib_range; // min float value calculated from calibration dataset
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
dmlc::optional<bool> channel_wise_quantize;

DMLC_DECLARE_PARAMETER(MKLDNNFCParam) {
DMLC_DECLARE_FIELD(quantized).set_default(false)
@@ -61,15 +62,17 @@ struct MKLDNNFCParam: public dmlc::Parameter<MKLDNNFCParam> {
.describe("The maximum scalar value in the form of float32 obtained "
"through calibration. If present, it will be used to by "
"quantized fullyconnected op to calculate primitive scale");
DMLC_DECLARE_FIELD(channel_wise_quantize)
.set_default(dmlc::optional<bool>())
.describe("Whether to support channel-wise quantization for weight.");
}
};

struct MKLDNNFCFullParam {
FullyConnectedParam default_param;
MKLDNNFCParam mkldnn_param;
MKLDNNPostEltwiseParam eltwise_param;
std::vector<float> output_scales = {0.0};
std::vector<float> requantize_scales = {0.0};
std::vector<float> output_scales = {0.0f};
};

mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(
25 changes: 7 additions & 18 deletions src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -37,7 +37,8 @@ mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(
const NDArray &data, const NDArray &weight, const NDArray *bias,
const mkldnn::memory::desc &out_md) {
auto data_md = GetMemDesc(data);
auto weight_md = GetFCWeightDesc(weight);
auto weight_md = full_param.mkldnn_param.quantized ?
GetFCWeightDesc(weight, mshadow::kInt8) : GetFCWeightDesc(weight);
auto engine = CpuEngine::Get()->get_engine();
auto propagation =
is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
@@ -52,22 +53,9 @@
}
attr.set_post_ops(ops);

if (full_param.mkldnn_param.quantized) {
if ((full_param.mkldnn_param.min_calib_range.has_value() &&
full_param.mkldnn_param.max_calib_range.has_value()) ||
full_param.mkldnn_param.enable_float_output) {
int mask = 0;
std::vector<float> scales = {0.0};
if (full_param.requantize_scales.size()) {
scales[0] = full_param.requantize_scales[0];
} else if (full_param.output_scales.size()) {
scales[0] = full_param.output_scales[0];
} else {
LOG(FATAL) << "Must specified either output_scales or requantize_scales!";
}

attr.set_output_scales(mask, scales);
}
if (full_param.mkldnn_param.quantized && full_param.output_scales.size()) {
int mask = (full_param.output_scales.size() == 1) ? 0 : (1 << 1);
attr.set_output_scales(mask, full_param.output_scales);
}

auto GetFCFwdPd = [&full_param, &attr,
@@ -88,7 +76,8 @@ mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(
if (bias) {
if ((*bias).shape().ndim() != 1)
LOG(FATAL) << "Unexpected shape for bias " << (*bias).shape();
auto bias_md = GetMemDesc(*bias);
auto bias_md =
full_param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias);
mkldnn::inner_product_forward::desc desc(propagation,
data_md, weight_md, bias_md, out_md);
return GetFCFwdPd(desc);
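
Conceptually, channel-wise quantization replaces the single weight scale with one scale per output channel (one per row of the FullyConnected weight matrix), which is what the per-channel output-scales mask above enables. A small NumPy illustration of that idea, not code from this commit:

```python
# Illustration only (not from this commit): int8 scales for an FC weight matrix
# of shape (num_hidden, input_dim), tensor-wise vs channel-wise.
import numpy as np

def fc_weight_scales(weight, channel_wise=True):
    int8_range = 127.0
    if channel_wise:
        # one scale per output channel, i.e. per row of the weight matrix
        absmax = np.abs(weight).max(axis=1)
    else:
        # tensor-wise: a single scale for the whole tensor
        absmax = np.abs(weight).max()
    return int8_range / np.maximum(absmax, 1e-30)

w = np.random.randn(8, 16).astype(np.float32)
print(fc_weight_scales(w).shape)         # (8,)  one scale per output channel
print(fc_weight_scales(w, False).shape)  # ()    single tensor-wise scale
```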