diff --git a/example/quantization/README.md b/example/quantization/README.md
index 1ae58fbb3a69..2215a13a97d5 100644
--- a/example/quantization/README.md
+++ b/example/quantization/README.md
@@ -86,21 +86,20 @@ Use the following command to install [Gluon-CV](https://gluon-cv.mxnet.io/):
pip install gluoncv
```
-Below are some quantization demos. These models have been tested on Linux systems.
+The following models have been tested on Linux systems. Accuracy was collected on an Intel Xeon Cascade Lake CPU; on Skylake or earlier architectures the numbers may differ slightly.
| Model | Source | Dataset | FP32 Accuracy (top-1/top-5)| INT8 Accuracy (top-1/top-5)|
|:---|:---|---|:---:|:---:|
-| [ResNet18-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |70.15%/89.38%|69.92%/89.26%|
-| [ResNet50-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 76.34%/93.13% | 75.91%/92.95% |
-| [ResNet50-V1b](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 76.82%/93.38% | 76.39%/93.24% |
-| [ResNet101-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 77.33%/93.59% | 77.05%/93.43% |
-|[Squeezenet 1.0](#4)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|56.98%/79.20%|52.98%/77.21%|
-|[MobileNet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.23%/90.64%|72.03%/90.42%|
-|[MobileNetV2 1.0](#6)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|70.27%/89.62%|69.70%/89.26%|
-|[Inception V3](#7)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|77.76%/93.83% |77.87%/93.78% |
-|[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.65%/93.07%|76.36%/92.89%|
-|[Inception-BN](#9)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.28%/90.63%|72.20%/90.56%|
-| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8364 mAP |
+| [ResNet18-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |70.15%/89.38%|69.92%/89.30%|
+| [ResNet50-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 76.34%/93.13% | 76.06%/92.99% |
+| [ResNet101-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 77.33%/93.59% | 77.07%/93.47% |
+|[Squeezenet 1.0](#4)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|56.98%/79.20%|56.79%/79.47%|
+|[MobileNet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.23%/90.64%|72.06%/90.53%|
+|[MobileNetV2 1.0](#6)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|70.27%/89.62%|69.82%/89.35%|
+|[Inception V3](#7)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|77.76%/93.83% |78.05%/93.91% |
+|[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.65%/93.07%|76.19%/92.88%|
+|[Inception-BN](#9)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.28%/90.63%|72.02%/90.53%|
+| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8357 mAP |
| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | COCO2014 | 0.2552 mAP | 0.253 mAP |
ResNetV1
diff --git a/example/quantization/imagenet_gen_qsym.py b/example/quantization/imagenet_gen_qsym.py
index 27a1d7a45a68..ec4c809a79d0 100644
--- a/example/quantization/imagenet_gen_qsym.py
+++ b/example/quantization/imagenet_gen_qsym.py
@@ -141,23 +141,12 @@ def save_params(fname, arg_params, aux_params, logger=None):
excluded_op_names = []
if args.model == 'imagenet1k-resnet-152':
rgb_mean = '0,0,0'
- if args.ctx == 'gpu':
- calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
- or name.find('sc') != -1
- or name.find('fc') != -1)
- else:
- calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
- or name.find('sc') != -1)
- excluded_sym_names += ['flatten0', 'fc1']
+ excluded_sym_names += ['flatten0', 'fc1']
if exclude_first_conv:
excluded_sym_names += ['conv0']
elif args.model == 'imagenet1k-inception-bn':
rgb_mean = '123.68,116.779,103.939'
- if args.ctx == 'gpu':
- calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
- or name.find('fc') != -1)
- else:
- calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1)
+ if args.ctx == 'cpu':
excluded_sym_names += ['flatten', 'fc1']
if exclude_first_conv:
excluded_sym_names += ['conv_1']
@@ -203,7 +192,7 @@ def save_params(fname, arg_params, aux_params, logger=None):
excluded_op_names=excluded_op_names,
calib_mode=calib_mode, calib_data=data,
num_calib_examples=num_calib_batches * batch_size,
- calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+ quantized_dtype=args.quantized_dtype,
logger=logger)
if calib_mode == 'entropy':
suffix = '-quantized-%dbatches-entropy' % num_calib_batches
diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py
index 6c87f58b63e2..e24fb780a9d8 100644
--- a/example/quantization/imagenet_gen_qsym_mkldnn.py
+++ b/example/quantization/imagenet_gen_qsym_mkldnn.py
@@ -193,7 +193,6 @@ def save_params(fname, arg_params, aux_params, logger=None):
# get image shape
image_shape = args.image_shape
- calib_layer = lambda name: name.endswith('_output') or name == "data"
exclude_first_conv = args.exclude_first_conv
if args.quantized_dtype == "uint8":
logger.info('quantized dtype is set to uint8, will exclude first conv.')
@@ -295,7 +294,7 @@ def save_params(fname, arg_params, aux_params, logger=None):
ctx=ctx, excluded_sym_names=excluded_sym_names,
calib_mode=calib_mode, calib_data=data,
num_calib_examples=num_calib_batches * batch_size,
- calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+ quantized_dtype=args.quantized_dtype,
label_names=(label_name,), logger=logger)
if calib_mode == 'entropy':
suffix = '-quantized-%dbatches-entropy' % num_calib_batches
diff --git a/example/ssd/quantization.py b/example/ssd/quantization.py
index d50935499240..9801a7dc0076 100644
--- a/example/ssd/quantization.py
+++ b/example/ssd/quantization.py
@@ -115,19 +115,10 @@ def save_params(fname, arg_params, aux_params, logger=None):
# get image shape
image_shape = '3,300,300'
- def calib_layer(name): return not (name.endswith('_data') or
- name.endswith('_weight') or
- name.endswith('_bias') or
- name.endswith('_workspace'))
# Quantization layer configs
exclude_first_conv = args.exclude_first_conv
excluded_sym_names = []
rgb_mean = '123,117,104'
- for i in range(1,19):
- excluded_sym_names += ['flatten'+str(i)]
- excluded_sym_names += ['multibox_loc_pred',
- 'concat0',
- 'concat1']
if exclude_first_conv:
excluded_sym_names += ['conv1_1']
@@ -159,7 +150,7 @@ def calib_layer(name): return not (name.endswith('_data') or
ctx=ctx, excluded_sym_names=excluded_sym_names,
calib_mode=calib_mode, calib_data=eval_iter,
num_calib_examples=num_calib_batches * batch_size,
- calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+ quantized_dtype=args.quantized_dtype,
label_names=(label_name,), logger=logger)
sym_name = '%s-symbol.json' % ('./model/cqssd_vgg16_reduced_300')
param_name = '%s-%04d.params' % ('./model/cqssd_vgg16_reduced_300', epoch)
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index d4f756f5333c..e0ed316a1bdc 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -1907,6 +1907,9 @@ MXNET_DLL int MXSymbolInferTypePartial(SymbolHandle sym,
* \param offline_params array of c strings representing the names of params quantized offline
* \param quantized_dtype the quantized destination type for input data
* \param calib_quantize **Deprecated**. quantize op will always be calibrated if could
+ * \param quantize_mode quantize mode to be used in quantize pass
+ * \param out_num_calib_names return the number of nodes to be calibrated
+ * \param out_calib_names return the node names to be calibrated
*/
MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle, SymbolHandle *ret_sym_handle,
const int* dev_type,
@@ -1915,7 +1918,9 @@ MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle, SymbolHandle *ret_sym_ha
const mx_uint num_excluded_op_names,
const char **excluded_op_names,
const mx_uint num_offline, const char **offline_params,
- const char *quantized_dtype, const bool calib_quantize);
+ const char *quantized_dtype, const bool calib_quantize,
+ const char *quantize_mode, mx_uint* out_num_calib_names,
+ const char ***out_calib_names);
/*!
* \brief Convert a symbol into a mixed precision symbol with cast operators for target dtype casting
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index 889b5028a460..75d843c98bd2 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -131,6 +131,16 @@ enum class DispatchMode {
kVariable,
};
+/*! \brief the quantization type of the operator */
+enum class QuantizeType {
+ // This operator doesn't support quantization
+ kNone = 0,
+ // This operator can get huge benefit from quantization, thus must be quantized
+ kMust,
+ // This operator support quantization, but will be decided depending on the connection
+ kSupport,
+};
+
/*!
* \brief Operator state. This is a pointer type, its content is mutable
* even if OpStatePtr is const.
@@ -297,6 +307,12 @@ using FInferStorageType = std::function<bool (const NodeAttrs& attrs,
std::vector<int>* out_attrs)>;
+/*!
+ * \brief Register a function to determine the quantization type (kNone, kMust or
+ * kSupport) of an operator based on the attrs of the node
+ * \note Register under "FQuantizable" for quantizable operators
+ */
+using FQuantizable = std::function<QuantizeType (const NodeAttrs& attrs)>;
+
/*!
* \brief Register a quantized node creation function based on the attrs of the node
* \note Register under "FQuantizedOp" for non-quantized operators
@@ -319,6 +335,20 @@ using FNeedRequantize = std::function<bool (const NodeAttrs& attrs)>;
using FAvoidQuantizeInput = std::function<bool (const NodeAttrs& attrs, size_t index)>;
+/*!
+ * \brief Register a function to determine which inputs of a quantized operator
+ * need to be calibrated. This is usually used for quantized operators
+ * which need calibration on their inputs.
+ */
+using FNeedCalibrateInput = std::function<std::vector<int> (const NodeAttrs& attrs)>;
+
+/*!
+ * \brief Register a function to determine which outputs of a quantized operator
+ * need to be calibrated. This is usually used for quantized operators
+ * which need calibration on their outputs.
+ */
+using FNeedCalibrateOutput = std::function<std::vector<int> (const NodeAttrs& attrs)>;
+
} // namespace mxnet
#endif // MXNET_OP_ATTR_TYPES_H_
diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
index c34e934bfa02..ef24bddf9bd1 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -84,7 +84,7 @@ def _quantize_params(qsym, params, th_dict):
return quantized_params
def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
- offline_params=None, quantized_dtype='int8'):
+ offline_params=None, quantized_dtype='int8', quantize_mode='smart'):
"""Given a symbol object representing a neural network of data type FP32,
quantize it into a INT8 network.
@@ -106,6 +106,9 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
avoided.
quantized_dtype: str
The quantized destination type for input data.
+ quantize_mode: str
+ The quantization mode to be applied by the quantize pass.
+
"""
num_excluded_symbols = 0
if excluded_symbols is not None:
@@ -129,6 +132,8 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
offline.append(c_str(k))
out = SymbolHandle()
+ size = mx_uint()
+ calib_str = ctypes.POINTER(ctypes.c_char_p)()
check_call(_LIB.MXQuantizeSymbol(sym.handle,
ctypes.byref(out),
ctypes.byref(ctypes.c_int(ctx.device_typeid)),
@@ -139,47 +144,75 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None,
mx_uint(num_offline),
c_array(ctypes.c_char_p, offline),
c_str(quantized_dtype),
- ctypes.c_bool(True)))
- return Symbol(out)
-
-
-class _LayerOutputCollector(object):
- """Saves layer output NDArray in a dict with layer names as keys and lists of NDArrays as
- values. The collected NDArrays will be used for calculating the optimal thresholds for
+ ctypes.c_bool(True),
+ c_str(quantize_mode),
+ ctypes.byref(size),
+ ctypes.byref(calib_str)))
+ calib_layer = [py_str(calib_str[i]) for i in range(size.value)]
+ return Symbol(out), calib_layer
+
+def combine_histogram(old_hist, arr, new_min, new_max, new_th):
+ """ Collect layer histogram for arr and combine it with old histogram.
+ """
+ (old_hist, old_hist_edges, old_min, old_max, old_th) = old_hist
+ if new_th <= old_th:
+ hist, _ = np.histogram(arr, bins=len(old_hist), range=(-old_th, old_th))
+ return (old_hist + hist, old_hist_edges, min(old_min, new_min), max(old_max, new_max), old_th)
+ else:
+ # Need to generate new histogram with new_th
+ old_num_bins = len(old_hist)
+ old_step = 2 * old_th / old_num_bins
+ half_increased_bins = int((new_th - old_th) // old_step + 1)
+ new_num_bins = half_increased_bins * 2 + old_num_bins
+ new_th = half_increased_bins * old_step + old_th
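+ # The widened range grows by a whole number of old-width bins on each side, so the
+ # old histogram can be added bin-for-bin into the middle of the new histogram below.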
+ hist, hist_edges = np.histogram(arr, bins=new_num_bins, range=(-new_th, new_th))
+ hist[half_increased_bins:new_num_bins - half_increased_bins] += old_hist
+ return (hist, hist_edges, min(old_min, new_min), max(old_max, new_max), new_th)
+
+class _LayerHistogramCollector(object):
+ """Saves layer histogram in a dict with layer names as keys and lists of NDArrays as
+ values. The collected histogram will be used for calculating the optimal thresholds for
quantization using KL divergence.
"""
- def __init__(self, include_layer=None, logger=None):
- self.nd_dict = {}
+ def __init__(self, num_bins=8001, include_layer=None, logger=None):
+ self.hist_dict = {}
+ self.num_bins = num_bins
self.include_layer = include_layer
self.logger = logger
def collect(self, name, arr):
"""Callback function for collecting layer output NDArrays."""
name = py_str(name)
- if self.include_layer is not None and not self.include_layer(name):
+ if name not in self.include_layer:
return
handle = ctypes.cast(arr, NDArrayHandle)
- arr = NDArray(handle, writable=False).copyto(cpu())
+ arr = NDArray(handle, writable=False).copyto(cpu()).asnumpy()
if self.logger is not None:
- self.logger.info("Collecting layer %s output of shape %s" % (name, arr.shape))
- if name in self.nd_dict:
- self.nd_dict[name].append(arr)
+ self.logger.info("Collecting layer %s histogram of shape %s" % (name, arr.shape))
+ min_range = np.min(arr)
+ max_range = np.max(arr)
+ th = max(abs(min_range), abs(max_range))
+ if name in self.hist_dict:
+ self.hist_dict[name] = combine_histogram(self.hist_dict[name], arr, min_range, max_range, th)
else:
- self.nd_dict[name] = [arr]
+ hist, hist_edges = np.histogram(arr, bins=self.num_bins, range=(-th, th))
+ self.hist_dict[name] = (hist, hist_edges, min_range, max_range, th)
class _LayerOutputMinMaxCollector(object):
"""Saves layer output min and max values in a dict with layer names as keys.
The collected min and max values will be directly used as thresholds for quantization.
"""
- def __init__(self, include_layer=None, logger=None):
+ def __init__(self, quantized_dtype, include_layer=None, logger=None):
self.min_max_dict = {}
+ self.quantized_dtype = quantized_dtype
self.include_layer = include_layer
self.logger = logger
def collect(self, name, arr):
"""Callback function for collecting min and max values from an NDArray."""
name = py_str(name)
- if self.include_layer is not None and not self.include_layer(name):
+ if name not in self.include_layer:
return
handle = ctypes.cast(arr, NDArrayHandle)
arr = NDArray(handle, writable=False)
@@ -239,21 +272,23 @@ def _collect_layer_statistics(mod, data, collector, max_num_examples=None, logge
return num_examples
-def _collect_layer_output_min_max(mod, data, include_layer=None,
+def _collect_layer_output_min_max(mod, data, quantized_dtype, include_layer=None,
max_num_examples=None, logger=None):
"""Collect min and max values from layer outputs and save them in
a dictionary mapped by layer names.
"""
- collector = _LayerOutputMinMaxCollector(include_layer=include_layer, logger=logger)
+ collector = _LayerOutputMinMaxCollector(quantized_dtype=quantized_dtype,
+ include_layer=include_layer, logger=logger)
num_examples = _collect_layer_statistics(mod, data, collector, max_num_examples, logger)
return collector.min_max_dict, num_examples
-def _collect_layer_outputs(mod, data, include_layer=None, max_num_examples=None, logger=None):
+def _collect_layer_histogram(mod, data, include_layer=None,
+ max_num_examples=None, logger=None):
"""Collect layer outputs and save them in a dictionary mapped by layer names."""
- collector = _LayerOutputCollector(include_layer=include_layer, logger=logger)
+ collector = _LayerHistogramCollector(include_layer=include_layer, logger=logger)
num_examples = _collect_layer_statistics(mod, data, collector, max_num_examples, logger)
- return collector.nd_dict, num_examples
+ return collector.hist_dict, num_examples
def _smooth_distribution(p, eps=0.0001):
@@ -277,126 +312,55 @@ def _smooth_distribution(p, eps=0.0001):
# pylint: disable=line-too-long
-def _get_optimal_threshold(arr, quantized_dtype, num_bins=8001, num_quantized_bins=255):
+def _get_optimal_threshold(hist_data, quantized_dtype, num_quantized_bins=255):
"""Given a dataset, find the optimal threshold for quantizing it.
The reference distribution is `q`, and the candidate distribution is `p`.
`q` is a truncated version of the original distribution.
Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
"""
- if isinstance(arr, NDArray):
- arr = arr.asnumpy()
- elif isinstance(arr, list):
- assert len(arr) != 0
- for i, nd in enumerate(arr):
- if isinstance(nd, NDArray):
- arr[i] = nd.asnumpy()
- elif not isinstance(nd, np.ndarray):
- raise TypeError('get_optimal_threshold only supports input type of NDArray,'
- ' list of np.ndarrays or NDArrays, and np.ndarray,'
- ' while received type=%s' % (str(type(nd))))
- arr = np.concatenate(arr)
- elif not isinstance(arr, np.ndarray):
- raise TypeError('get_optimal_threshold only supports input type of NDArray,'
- ' list of NDArrays and np.ndarray,'
- ' while received type=%s' % (str(type(arr))))
- min_val = np.min(arr)
- max_val = np.max(arr)
- th = max(abs(min_val), abs(max_val))
-
+ (hist, hist_edges, min_val, max_val, _) = hist_data
+ num_bins = len(hist)
+ assert (num_bins % 2 == 1)
if min_val >= 0 and quantized_dtype in ['auto', 'uint8']:
# We need to move negative bins to positive bins to fit uint8 range.
num_quantized_bins = num_quantized_bins * 2 + 1
-
- hist, hist_edges = np.histogram(arr, bins=num_bins, range=(-th, th))
- zero_bin_idx = num_bins // 2
- num_half_quantized_bins = num_quantized_bins // 2
-
- thresholds = np.zeros(num_bins // 2 + 1 - num_quantized_bins // 2)
- divergence = np.zeros_like(thresholds)
- quantized_bins = np.zeros(num_quantized_bins, dtype=np.int32)
- # i means the number of bins on half axis excluding the zero bin.
- for i in range(num_quantized_bins // 2,
- num_bins // 2 + 1):
- p_bin_idx_start = zero_bin_idx - i
- p_bin_idx_stop = zero_bin_idx + i + 1
- thresholds[i - num_half_quantized_bins] = hist_edges[p_bin_idx_stop]
- sliced_nd_hist = hist[p_bin_idx_start:p_bin_idx_stop]
-
- # generate reference distribution p
- p = sliced_nd_hist.copy()
- assert p.size % 2 == 1
- assert p.size >= num_quantized_bins
- # put left outlier count in p[0]
- left_outlier_count = np.sum(hist[0:p_bin_idx_start])
- p[0] += left_outlier_count
- # put right outlier count in p[-1]
- right_outlier_count = np.sum(hist[p_bin_idx_stop:])
- p[-1] += right_outlier_count
- # is_nonzeros[k] indicates whether hist[k] is nonzero
- is_nonzeros = (p != 0).astype(np.int32)
-
- # calculate how many bins should be merged to generate quantized distribution q
- num_merged_bins = sliced_nd_hist.size // num_quantized_bins
- # merge hist into num_quantized_bins bins
- for j in range(num_quantized_bins):
- start = j * num_merged_bins
- stop = start + num_merged_bins
- quantized_bins[j] = sliced_nd_hist[start:stop].sum()
- quantized_bins[-1] += sliced_nd_hist[num_quantized_bins * num_merged_bins:].sum()
- # expand quantized_bins into p.size bins
- q = np.zeros(sliced_nd_hist.size, dtype=np.float32)
- for j in range(num_quantized_bins):
- start = j * num_merged_bins
- if j == num_quantized_bins - 1:
- stop = len(is_nonzeros)
- else:
- stop = start + num_merged_bins
- norm = is_nonzeros[start:stop].sum()
- if norm != 0:
- q[start:stop] = float(quantized_bins[j]) / float(norm)
- q[p == 0] = 0
- p = _smooth_distribution(p)
- # There is a chance that q is an invalid probability distribution.
- try:
- q = _smooth_distribution(q)
- except ValueError:
- divergence[i - num_half_quantized_bins] = float("inf")
- divergence[i - num_half_quantized_bins] = stats.entropy(p, q)
-
- min_divergence_idx = np.argmin(divergence)
- min_divergence = divergence[min_divergence_idx]
- opt_th = thresholds[min_divergence_idx]
- return min_val, max_val, min_divergence, opt_th
+ hist = ndarray.array(hist, ctx=cpu())
+ hist_edges = ndarray.array(hist_edges, ctx=cpu())
+ threshold, divergence = ndarray.contrib.calibrate_entropy(hist=hist,
+ hist_edges=hist_edges,
+ num_quantized_bins=num_quantized_bins)
+ threshold = threshold.asnumpy()
+ divergence = divergence.asnumpy()
+ return min_val, max_val, threshold, divergence
# pylint: enable=line-too-long
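The rewritten `_get_optimal_threshold` above delegates the KL-divergence search to the new `_contrib_calibrate_entropy` operator. For reference, a minimal stand-alone sketch of that call (the synthetic activations and the 8001-bin histogram are illustrative assumptions, not part of this patch):

```python
import numpy as np
import mxnet as mx

# Stand-in for one layer's collected FP32 outputs.
arr = np.random.laplace(scale=0.5, size=100000).astype('float32')
th = max(abs(arr.min()), abs(arr.max()))
hist, hist_edges = np.histogram(arr, bins=8001, range=(-th, th))

# The operator returns the calibrated threshold and the KL divergence at that threshold.
threshold, divergence = mx.nd.contrib.calibrate_entropy(
    hist=mx.nd.array(hist, ctx=mx.cpu()),
    hist_edges=mx.nd.array(hist_edges, ctx=mx.cpu()),
    num_quantized_bins=255)
print(threshold.asscalar(), divergence.asscalar())
```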
-
-def _get_optimal_thresholds(nd_dict, quantized_dtype, num_bins=8001, num_quantized_bins=255, logger=None):
+def _get_optimal_thresholds(hist_dict, quantized_dtype, num_quantized_bins=255, logger=None):
"""Given a ndarray dict, find the optimal threshold for quantizing each value of the key."""
if stats is None:
raise ImportError('scipy.stats is required for running entropy mode of calculating'
' the optimal thresholds for quantizing FP32 ndarrays into int8.'
' Please check if the scipy python bindings are installed.')
- assert isinstance(nd_dict, dict)
+ assert isinstance(hist_dict, dict)
if logger is not None:
logger.info('Calculating optimal thresholds for quantization using KL divergence'
- ' with num_bins=%d and num_quantized_bins=%d' % (num_bins, num_quantized_bins))
+ ' with num_quantized_bins=%d' % num_quantized_bins)
th_dict = {}
- # copy nd_dict keys since the keys() only returns a view in python3
- layer_names = list(nd_dict.keys())
+ # copy hist_dict keys since the keys() only returns a view in python3
+ layer_names = list(hist_dict.keys())
for name in layer_names:
- assert name in nd_dict
- min_val, max_val, min_divergence, opt_th = \
- _get_optimal_threshold(nd_dict[name], quantized_dtype, num_bins=num_bins,
+ assert name in hist_dict
+ min_val, max_val, th, divergence = \
+ _get_optimal_threshold(hist_dict[name], quantized_dtype,
num_quantized_bins=num_quantized_bins)
- del nd_dict[name] # release the memory of ndarray
- if min_val < 0:
- th_dict[name] = (-opt_th, opt_th)
+ if min_val >= 0 and quantized_dtype in ['auto', 'uint8']:
+ th_dict[name] = (0, th)
else:
- th_dict[name] = (0, opt_th)
+ th_dict[name] = (-th, th)
+ del hist_dict[name] # release the memory
if logger is not None:
- logger.info('layer=%s, min_val=%f, max_val=%f, min_divergence=%f, optimal_threshold=%f'
- % (name, min_val, max_val, min_divergence, opt_th))
+ logger.info('layer=%s, min_val=%f, max_val=%f, th=%f, divergence=%f'
+ % (name, min_val, max_val, th, divergence))
return th_dict
@@ -479,8 +443,8 @@ def _as_data_iter(calib_data):
def quantize_model(sym, arg_params, aux_params,
data_names=('data',), label_names=('softmax_label',),
ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy',
- calib_data=None, num_calib_examples=None, calib_layer=None,
- quantized_dtype='int8', logger=logging):
+ calib_data=None, num_calib_examples=None,
+ quantized_dtype='int8', quantize_mode='smart', logger=logging):
"""User-level API for generating a quantized model from a FP32 model w/ or w/o calibration.
The backend quantized operators are only enabled for Linux systems. Please do not run
inference using the quantized models on Windows for now.
@@ -528,15 +492,14 @@ def quantize_model(sym, arg_params, aux_params,
num_calib_examples : int or None
The maximum number of examples that user would like to use for calibration. If not provided,
the whole calibration dataset will be used.
- calib_layer : function
- Given a layer's output name in string, return True or False for deciding whether to
- calibrate this layer. If yes, the statistics of the layer's output will be collected;
- otherwise, no information of the layer's output will be collected. If not provided,
- all the layers' outputs that need requantization will be collected.
quantized_dtype : str
- The quantized destination type for input data. Currently support 'int8'
- , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result.
+ The quantized destination type for input data. Currently support 'int8', 'uint8' and 'auto'.
+ 'auto' means automatically select output type according to calibration result.
Default value is 'int8'.
+ quantize_mode : str
+ The quantization mode to apply. Supports 'full' and 'smart'.
+ 'full' quantizes all operators if possible.
+ 'smart' lets the quantization pass choose which operators should be quantized.
logger : Object
A logging object for printing information during the process of quantization.
@@ -564,11 +527,11 @@ def quantize_model(sym, arg_params, aux_params,
if quantized_dtype not in ('int8', 'uint8', 'auto'):
raise ValueError('unknown quantized_dtype %s received,'
' expected `int8`, `uint8` or `auto`' % quantized_dtype)
- qsym = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
- excluded_operators=excluded_op_names,
- offline_params=list(arg_params.keys()),
- quantized_dtype=quantized_dtype)
-
+ qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
+ excluded_operators=excluded_op_names,
+ offline_params=list(arg_params.keys()),
+ quantized_dtype=quantized_dtype, quantize_mode=quantize_mode)
th_dict = {}
if calib_mode is not None and calib_mode != 'none':
if not isinstance(ctx, Context):
@@ -587,23 +550,22 @@ def quantize_model(sym, arg_params, aux_params,
mod.bind(for_training=False, data_shapes=calib_data.provide_data)
mod.set_params(arg_params, aux_params)
if calib_mode == 'entropy':
- nd_dict, num_examples = _collect_layer_outputs(mod, calib_data,
- include_layer=calib_layer,
- max_num_examples=num_calib_examples,
- logger=logger)
+ hist_dict, num_examples = _collect_layer_histogram(mod, calib_data,
+ include_layer=calib_layer,
+ max_num_examples=num_calib_examples,
+ logger=logger)
logger.info('Collected layer outputs from FP32 model using %d examples' % num_examples)
logger.info('Calculating optimal thresholds for quantization')
- th_dict = _get_optimal_thresholds(nd_dict, quantized_dtype, logger=logger)
+ th_dict = _get_optimal_thresholds(hist_dict, quantized_dtype, logger=logger)
elif calib_mode == 'naive':
th_dict, num_examples = _collect_layer_output_min_max(
- mod, calib_data, include_layer=calib_layer, max_num_examples=num_calib_examples,
+ mod, calib_data, quantized_dtype, include_layer=calib_layer, max_num_examples=num_calib_examples,
logger=logger)
logger.info('Collected layer output min/max values from FP32 model using %d examples'
% num_examples)
else:
raise ValueError('unknown calibration mode %s received,'
' expected `none`, `naive`, or `entropy`' % calib_mode)
- logger.info('Calibrating quantized symbol')
qsym = _calibrate_quantized_sym(qsym, th_dict)
logger.info('Quantizing parameters')
@@ -615,7 +577,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
data_names=('data',), label_names=('softmax_label',),
ctx=cpu(), excluded_sym_names=None, excluded_op_names=None,
calib_mode='entropy', calib_data=None, num_calib_examples=None,
- calib_layer=None, quantized_dtype='int8', logger=logging):
+ quantized_dtype='int8', logger=logging):
"""User-level API for generating a fusion + quantized model from a FP32 model
w/ or w/o calibration with Intel MKL-DNN.
The backend quantized operators are only enabled for Linux systems. Please do not run
@@ -642,7 +604,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
ctx=ctx, excluded_sym_names=excluded_sym_names,
excluded_op_names=excluded_op_names,
calib_mode=calib_mode, calib_data=calib_data,
- num_calib_examples=num_calib_examples, calib_layer=calib_layer,
+ num_calib_examples=num_calib_examples,
quantized_dtype=quantized_dtype, logger=logger)
qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')
@@ -651,7 +613,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy',
- calib_layer=None, quantized_dtype='int8', logger=logging):
+ quantized_dtype='int8', logger=logging):
"""User-level API for generating a quantized model from a FP32 model w/o calibration
and a collector for naive or entropy calibration.
The backend quantized operators are only enabled for Linux systems. Please do not run
@@ -682,11 +644,6 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
If calib_mode='entropy' (default mode), the thresholds for quantization will be
derived such that the KL divergence between the distributions of FP32 layer outputs and
quantized layer outputs is minimized based upon the calibration dataset.
- calib_layer : function
- Given a layer's output name in string, return True or False for deciding whether to
- calibrate this layer. If yes, the statistics of the layer's output will be collected;
- otherwise, no information of the layer's output will be collected. If not provided,
- all the layers' outputs that need requantization will be collected.
quantized_dtype : str
The quantized destination type for input data. Currently support 'int8'
, 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result.
@@ -711,22 +668,23 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
if quantized_dtype not in ('int8', 'uint8', 'auto'):
raise ValueError('unknown quantized_dtype %s received,'
' expected `int8`, `uint8` or `auto`' % quantized_dtype)
- qsym = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
- excluded_operators=excluded_op_names,
- offline_params=list(arg_params.keys()),
- quantized_dtype=quantized_dtype)
+ qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names,
+ excluded_operators=excluded_op_names,
+ offline_params=list(arg_params.keys()),
+ quantized_dtype=quantized_dtype)
th_dict = {}
collector = None
if calib_mode is not None and calib_mode != 'none':
if calib_mode == 'entropy':
- collector = _LayerOutputCollector(
+ collector = _LayerHistogramCollector(
include_layer=calib_layer, logger=logger)
logger.info(
'Create a layer output collector for entropy calibration.')
elif calib_mode == 'naive':
- collector = _LayerOutputMinMaxCollector(
- include_layer=calib_layer, logger=logger)
+ collector = _LayerOutputMinMaxCollector(quantized_dtype=quantized_dtype,
+ include_layer=calib_layer, logger=logger)
logger.info(
'Create a layer output minmax collector for naive calibration')
else:
@@ -765,11 +723,6 @@ def calib_graph(qsym, arg_params, aux_params, collector,
If calib_mode='entropy' (default mode), the thresholds for quantization will be
derived such that the KL divergence between the distributions of FP32 layer outputs and
quantized layer outputs is minimized based upon the calibration dataset.
- calib_layer : function
- Given a layer's output name in string, return True or False for deciding whether to
- calibrate this layer. If yes, the statistics of the layer's output will be collected;
- otherwise, no information of the layer's output will be collected. If not provided,
- all the layers' outputs that need requantization will be collected.
quantized_dtype : str
The quantized destination type for input data. Currently support 'int8'
, 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result.
@@ -787,13 +740,12 @@ def calib_graph(qsym, arg_params, aux_params, collector,
if calib_mode == 'entropy':
logger.info('Calculating optimal thresholds for quantization')
th_dict = _get_optimal_thresholds(
- collector.nd_dict, quantized_dtype, logger=logger)
+ collector.hist_dict, quantized_dtype, logger=logger)
elif calib_mode == 'naive':
th_dict = collector.min_max_dict
else:
raise ValueError('unknown calibration mode %s received,'
' expected `none`, `naive`, or `entropy`' % calib_mode)
- logger.info('Calibrating quantized symbol')
qsym = _calibrate_quantized_sym(qsym, th_dict)
else:
raise ValueError('please set calibration mode to naive or entropy.')
@@ -840,11 +792,6 @@ def quantize_net(network, quantized_dtype='auto',
If calib_mode='entropy' (default mode), the thresholds for quantization will be
derived such that the KL divergence between the distributions of FP32 layer outputs and
quantized layer outputs is minimized based upon the calibration dataset.
- calib_layer : function
- Given a layer's output name in string, return True or False for deciding whether to
- calibrate this layer. If yes, the statistics of the layer's output will be collected;
- otherwise, no information of the layer's output will be collected. If not provided,
- all the layers' outputs that need requantization will be collected.
num_calib_examples : int or None
The maximum number of examples that user would like to use for calibration. If not provided,
the whole calibration dataset will be used.
@@ -925,7 +872,7 @@ def __exit__(self, exc_type, exc_value, traceback):
qsym, qarg_params, aux_params, collector = quantize_graph(
sym=symnet, arg_params=args, aux_params=auxs, ctx=ctx,
excluded_sym_names=exclude_layers, excluded_op_names=exclude_operators,
- calib_mode=calib_mode, calib_layer=None, quantized_dtype=quantized_dtype, logger=logger)
+ calib_mode=calib_mode, quantized_dtype=quantized_dtype, logger=logger)
if calib_mode is not None and calib_mode != 'none':
if not isinstance(ctx, Context):
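With these changes, `calib_layer` disappears from the user-facing APIs and a `quantize_mode` argument is added. A hedged usage sketch of the updated `quantize_model` (the checkpoint prefix, record file and batch size are placeholders, not part of this patch):

```python
import logging
import mxnet as mx
from mxnet.contrib.quantization import quantize_model

# Placeholder FP32 checkpoint and calibration data; substitute a real model and dataset.
sym, arg_params, aux_params = mx.model.load_checkpoint('resnet50_v1', 0)
calib_data = mx.io.ImageRecordIter(path_imgrec='val_256_q90.rec', batch_size=32,
                                   data_shape=(3, 224, 224))

qsym, qarg_params, qaux_params = quantize_model(
    sym, arg_params, aux_params,
    ctx=mx.cpu(),
    calib_mode='entropy', calib_data=calib_data, num_calib_examples=32 * 5,
    quantized_dtype='auto',   # let calibration decide int8 vs uint8 per layer
    quantize_mode='smart',    # let the quantize pass choose which operators to quantize
    logger=logging)
```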
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index be0cad648015..40391082e1b5 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -888,7 +888,10 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
const mx_uint num_offline,
const char **offline_params,
const char *quantized_dtype,
- const bool calib_quantize) {
+ const bool calib_quantize,
+ const char *quantize_mode,
+ mx_uint* out_num_calib_names,
+ const char ***out_calib_names) {
nnvm::Symbol *s = new nnvm::Symbol();
API_BEGIN();
nnvm::Symbol *sym = static_cast<nnvm::Symbol*>(sym_handle);
@@ -907,12 +910,24 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
offline.emplace(offline_params[i]);
}
std::string quantized_type(quantized_dtype);
+ std::string quantized_mode(quantize_mode);
g.attrs["excluded_nodes"] = std::make_shared(std::move(excluded_node_names));
g.attrs["excluded_ops"] = std::make_shared(std::move(excluded_op));
g.attrs["offline_params"] = std::make_shared(std::move(offline));
g.attrs["quantized_dtype"] = std::make_shared(std::move(quantized_type));
g.attrs["target_ctx"] = std::make_shared(target_dev);
+ g.attrs["quantize_mode"] = std::make_shared(std::move(quantized_mode));
g = ApplyPass(std::move(g), "QuantizeGraph");
+ const auto& calib_nodes = g.GetAttr<std::vector<std::string>>("calib_nodes");
+ MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
+ ret->ret_vec_str = std::move(calib_nodes);
+ *out_num_calib_names = ret->ret_vec_str.size();
+ ret->ret_vec_charp.clear();
+ ret->ret_vec_charp.reserve(ret->ret_vec_str.size());
+ for (const auto &str : ret->ret_vec_str) {
+ ret->ret_vec_charp.push_back(str.c_str());
+ }
+ *out_calib_names = dmlc::BeginPtr(ret->ret_vec_charp);
s->outputs = g.outputs;
*ret_sym_handle = s;
API_END_HANDLE_ERROR(delete s);
diff --git a/src/common/utils.h b/src/common/utils.h
index 9dad5df84fd2..6edfa3c6bb56 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -28,6 +28,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -804,6 +805,15 @@ inline void ConvertToLegacyShape(mxnet::ShapeVector* shapes) {
}
}
+/*!
+ * \brief Return the output name of a NodeEntry.
+ */
+static inline std::string GetOutputName(const nnvm::NodeEntry& e) {
+ nnvm::Symbol sym;
+ sym.outputs.push_back(e);
+ return sym.ListOutputNames()[0];
+}
+
} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_UTILS_H_
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 7bdeac708003..d92253266f35 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -1370,24 +1370,13 @@ void GraphExecutor::ExecuteMonInputCallback(size_t nid) {
}
void GraphExecutor::ExecuteMonOutputCallback(size_t nid) {
- static const auto& flist_outputs =
- nnvm::Op::GetAttr<nnvm::FListOutputNames>("FListOutputNames");
const auto& idx = graph_.indexed_graph();
- std::vector<std::string> output_names;
OpNode& opnode = op_nodes_[nid];
- const auto& inode = idx[nid];
const auto& node = idx[nid].source;
- if (flist_outputs.count(node->op())) {
- output_names = flist_outputs[node->op()](node->attrs);
- } else {
- for (size_t i = 0; i < node->num_outputs(); ++i) {
- output_names.emplace_back(std::to_string(i));
- }
- }
- CHECK_EQ(opnode.exec->out_array.size(), output_names.size());
for (size_t i = 0; i < opnode.exec->out_array.size(); ++i) {
NDArray *cpy = new NDArray(opnode.exec->out_array[i]);
- std::string name = inode.source->attrs.name + "_" + output_names[i];
+ nnvm::NodePtr node_ptr = std::make_shared<nnvm::Node>(*node);
+ std::string name = GetOutputName({node_ptr, static_cast<uint32_t>(i), 0});
this->monitor_callback_(name.c_str(), reinterpret_cast(cpy));
}
}
diff --git a/src/operator/nn/mkldnn/mkldnn_flatten-inl.h b/src/operator/nn/mkldnn/mkldnn_flatten-inl.h
new file mode 100644
index 000000000000..ae890d8f3d91
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_flatten-inl.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_flatten-inl.h
+ * \brief Implement flatten operator by using mkldnn reorder primitive
+ * \author Wuxun Zhang
+ */
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FLATTEN_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FLATTEN_INL_H_
+#if MXNET_USE_MKLDNN == 1
+
+#include "mkldnn_reshape-inl.h"
+
+namespace mxnet {
+namespace op {
+
+class MKLDNNFlattenFwd : public MKLDNNReshapeFwd {
+ public:
+ explicit MKLDNNFlattenFwd(const OpReqType &req, const NDArray &input, const NDArray &output)
+ : MKLDNNReshapeFwd(req, input, output) {}
+};
+
+void MKLDNNFlattenForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const NDArray &input,
+ const OpReqType &req, const NDArray &output);
+
+} // namespace op
+} // namespace mxnet
+
+#endif // MXNET_USE_MKLDNN == 1
+#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FLATTEN_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_flatten.cc b/src/operator/nn/mkldnn/mkldnn_flatten.cc
index fdc02f960009..4090eb026cfc 100644
--- a/src/operator/nn/mkldnn/mkldnn_flatten.cc
+++ b/src/operator/nn/mkldnn/mkldnn_flatten.cc
@@ -25,19 +25,11 @@
#if MXNET_USE_MKLDNN == 1
-#include "mkldnn_reshape-inl.h"
+#include "mkldnn_flatten-inl.h"
namespace mxnet {
namespace op {
-class MKLDNNFlattenFwd : public MKLDNNReshapeFwd {
- public:
- explicit MKLDNNFlattenFwd(const OpReqType &req,
- const NDArray &input,
- const NDArray &output)
- : MKLDNNReshapeFwd(req, input, output) {}
-};
-
static MKLDNNFlattenFwd &GetFlattenForward(const OpReqType &req,
const NDArray &input,
const NDArray &output) {
diff --git a/src/operator/quantization/calibrate-inl.h b/src/operator/quantization/calibrate-inl.h
new file mode 100644
index 000000000000..ac7d04403be1
--- /dev/null
+++ b/src/operator/quantization/calibrate-inl.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file calibrate-inl.h
+ * \brief Implementation of calibrate operator
+ */
+#ifndef MXNET_OPERATOR_QUANTIZATION_CALIBRATE_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_CALIBRATE_INL_H_
+
+#include
+#include
+#include "../mxnet_op.h"
+#include "./quantization_utils.h"
+
+namespace mxnet {
+namespace op {
+
+struct CalibrateEntropyParam : public dmlc::Parameter<CalibrateEntropyParam> {
+ int num_quantized_bins;
+ DMLC_DECLARE_PARAMETER(CalibrateEntropyParam) {
+ DMLC_DECLARE_FIELD(num_quantized_bins)
+ .set_default(255)
+ .describe(
+ "The number of quantized bins.");
+ }
+};
+
+} // namespace op
+} // namespace mxnet
+#endif // MXNET_OPERATOR_QUANTIZATION_CALIBRATE_INL_H_
diff --git a/src/operator/quantization/calibrate.cc b/src/operator/quantization/calibrate.cc
new file mode 100644
index 000000000000..3852eab979f8
--- /dev/null
+++ b/src/operator/quantization/calibrate.cc
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file calibrate.cc
+ * \brief
+ */
+
+#include
+#include "./calibrate-inl.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(CalibrateEntropyParam);
+
+// Given a discrete distribution (may have not been normalized to 1),
+// smooth it by replacing zeros with eps multiplied by a scaling factor and taking the
+// corresponding amount off the non-zero values.
+std::vector<float> SmoothDistribution(const std::vector<float>& p, const float eps = 0.0001) {
+ std::vector<size_t> is_zeros(p.size());
+ std::vector<size_t> is_nonzeros(p.size());
+ {
+ auto it = p.begin();
+ std::generate(is_zeros.begin(), is_zeros.end(),
+ [&it]() { return static_cast<size_t>(*(it++) == 0.f); });
+ }
+ {
+ auto it = p.begin();
+ std::generate(is_nonzeros.begin(), is_nonzeros.end(),
+ [&it]() { return static_cast<size_t>(*(it++) != 0.f); });
+ }
+
+ size_t n_zeros = std::accumulate(is_zeros.begin(), is_zeros.end(), 0);
+ size_t n_nonzeros = p.size() - n_zeros;
+ if (!n_nonzeros) {
+ // The discrete probability distribution is malformed. All entries are 0.
+ return std::vector<float>();
+ }
+ float eps1 = eps * static_cast<float>(n_zeros) / static_cast<float>(n_nonzeros);
+ if (eps1 >= 1.0) return std::vector<float>();
+ auto ret = p;
+ for (size_t i = 0; i < p.size(); i++) {
+ ret[i] += eps * is_zeros[i] - eps1 * is_nonzeros[i];
+ }
+ return ret;
+}
+
+static float ComputeEntropy(std::vector<float>* p_ptr, std::vector<float>* q_ptr) {
+ std::vector& p = *p_ptr;
+ std::vector& q = *q_ptr;
+ CHECK_EQ(p.size(), q.size());
+ float p_sum = std::accumulate(p.begin(), p.end(), 0.f);
+ float q_sum = std::accumulate(q.begin(), q.end(), 0.f);
+ for (auto& it : p) {
+ it = it / p_sum;
+ }
+
+ for (auto& it : q) {
+ it = it / q_sum;
+ }
+ float ret = 0;
+ for (size_t i = 0; i < p.size(); i++) {
+ CHECK(p[i] > 0 && q[i] > 0);
+ if (p[i] && q[i]) ret += p[i] * std::log(p[i] / q[i]);
+ }
+ return ret;
+}
+
+void CalibrateComputeCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+ const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ const auto& param = nnvm::get<CalibrateEntropyParam>(attrs.parsed);
+ const auto& hist = inputs[0];
+ const auto& hist_ptr = hist.dptr<float>();
+ const auto& hist_edges = inputs[1];
+ const auto& hist_edges_ptr = hist_edges.dptr<float>();
+ float* const out_threshold = outputs[0].dptr<float>();
+ float* const out_divergence = outputs[1].dptr<float>();
+ const auto num_bins = hist.Size();
+ CHECK_EQ(num_bins + 1, hist_edges.Size());
+ int num_quantized_bins = param.num_quantized_bins;
+
+ const int zero_bin_idx = num_bins / 2;
+ const int num_half_quantized_bins = num_quantized_bins / 2;
+ std::vector<float> thresholds(num_bins / 2 + 1 - num_quantized_bins / 2, 0.f);
+ std::vector<float> divergence(thresholds.size(), 0.f);
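+ // Each candidate threshold keeps the centre 2*i+1 bins; counts falling outside that
+ // range are folded into the edge bins of p before p is compared, via KL divergence,
+ // against its num_quantized_bins-bin approximation q.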
+ #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+ for (index_t i = num_quantized_bins / 2; i <= zero_bin_idx; i++) {
+ const size_t p_bin_idx_start = zero_bin_idx - i;
+ const size_t p_bin_idx_stop = zero_bin_idx + i + 1;
+ thresholds[i - num_half_quantized_bins] = hist_edges_ptr[p_bin_idx_stop];
+
+ std::vector<size_t> sliced_nd_hist(p_bin_idx_stop - p_bin_idx_start);
+ std::vector<float> p(p_bin_idx_stop - p_bin_idx_start);
+ p[0] = 0;
+ p.back() = 0;
+ for (size_t j = 0; j < num_bins; j++) {
+ if (j <= p_bin_idx_start) {
+ p[0] += hist_ptr[j];
+ } else if (j >= p_bin_idx_stop) {
+ p.back() += hist_ptr[j];
+ } else {
+ sliced_nd_hist[j - p_bin_idx_start] = hist_ptr[j];
+ p[j - p_bin_idx_start] = hist_ptr[j];
+ }
+ }
+ // calculate how many bins should be merged to generate quantized distribution q
+ const auto num_merged_bins = sliced_nd_hist.size() / num_quantized_bins;
+ // merge hist into num_quantized_bins bins
+ std::vector<size_t> quantized_bins(num_quantized_bins, 0);
+ for (index_t j = 0; j < num_quantized_bins; j++) {
+ const int start = j * num_merged_bins;
+ const int stop = (j + 1) * num_merged_bins;
+ quantized_bins[j] =
+ std::accumulate(sliced_nd_hist.begin() + start, sliced_nd_hist.begin() + stop, 0);
+ }
+ quantized_bins.back() += std::accumulate(
+ sliced_nd_hist.begin() + static_cast<size_t>(num_quantized_bins * num_merged_bins),
+ sliced_nd_hist.end(), 0);
+ // expand quantized_bins into p.size bins
+ std::vector<float> q(sliced_nd_hist.size(), 0);
+ for (index_t j = 0; j < num_quantized_bins; j++) {
+ const int start = j * num_merged_bins;
+ const int stop = (j == num_quantized_bins - 1) ? q.size() : ((j + 1) * num_merged_bins);
+ int norm = std::count_if(sliced_nd_hist.begin() + start, sliced_nd_hist.begin() + stop,
+ [](size_t i) { return i != 0; });
+ if (norm) {
+ for (index_t k = start; k < stop; k++) {
+ if (p[k]) q[k] = quantized_bins[j] / norm;
+ }
+ }
+ }
+ p = SmoothDistribution(p);
+ q = SmoothDistribution(q);
+
+ if (!q.size()) {
+ divergence[i - num_half_quantized_bins] = std::numeric_limits<float>::infinity();
+ } else {
+ divergence[i - num_half_quantized_bins] = ComputeEntropy(&p, &q);
+ }
+ }
+
+ size_t min_divergence_idx = 0;
+ float min_divergence = mshadow::red::limits::MaxValue<float>();
+ for (size_t i = 0; i < divergence.size(); i++) {
+ if (divergence[i] < min_divergence) {
+ min_divergence = divergence[i];
+ min_divergence_idx = i;
+ }
+ }
+ *out_divergence = min_divergence;
+ *out_threshold = thresholds[min_divergence_idx];
+}
+
+static inline bool CalibrateShape(const nnvm::NodeAttrs& attrs, std::vector<TShape>* in_attrs,
+ std::vector<TShape>* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 2U);
+ CHECK_EQ(out_attrs->size(), 2U);
+ SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape(1, 1));
+ SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape(1, 1));
+ return (!shape_is_none(in_attrs->at(0))) && (!shape_is_none(in_attrs->at(1)));
+}
+
+static inline bool CalibrateType(const nnvm::NodeAttrs& attrs, std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 2U);
+ CHECK_EQ(out_attrs->size(), 2U);
+ CHECK(in_attrs->at(0) == mshadow::kFloat32);
+ TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32);
+ TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32);
+ return true;
+}
+
+NNVM_REGISTER_OP(_contrib_calibrate_entropy)
+.describe(R"code(Provide calibrated min/max for input histogram.
+
+.. Note::
+ This operator only supports forward propagation. DO NOT use it in training.)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<CalibrateEntropyParam>)
+.set_num_inputs(2)
+.set_num_outputs(2)
+.set_attr("FListInputNames", [](const NodeAttrs& attrs) {
+ return std::vector{"hist", "hist_edges"};
+})
+.set_attr("FListOutputNames", [](const NodeAttrs& attrs) {
+ return std::vector{"threshold", "divergence"};
+})
+.set_attr("FInferShape", CalibrateShape)
+.set_attr("FInferType", CalibrateType)
+.set_attr("FCompute", CalibrateComputeCPU)
+.add_argument("hist", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`")
+.add_argument("hist_edges", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`")
+.add_arguments(CalibrateEntropyParam::__FIELDS__());
+
+} // namespace op
+} // namespace mxnet
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
index bd1b47e4c2de..7cdce8e32bc8 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
@@ -68,10 +68,10 @@ void SgMKLDNNQuantizeOperator::Forward(const OpContext &ctx, const std::vector<NDArray> &inputs,
*outputs[1].data().dptr<float>() = 0;
- *outputs[2].data().dptr() = 255;
+ *outputs[2].data().dptr() = kUint8Range;
} else {
- *outputs[1].data().dptr() = -127;
- *outputs[2].data().dptr() = 127;
+ *outputs[1].data().dptr() = -kInt8Range;
+ *outputs[2].data().dptr() = kInt8Range;
}
}
if (req[0] != kWriteInplace) {
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc
index 05da99207651..2be6b2baca63 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc
@@ -60,7 +60,7 @@ static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, cons
auto dataB_mem = in_data[quantized_elemwise_add_enum::kDataB].GetMKLDNNData();
const bool is_dataA_int8 = (in_data[quantized_elemwise_add_enum::kDataA].dtype()
== mshadow::kInt8);
- const size_t dataA_range = is_dataA_int8 ? kInt8Range : kUint8Range;
+ const float dataA_range = is_dataA_int8 ? kInt8Range : kUint8Range;
const float A_scale = GetScale(in_data[quantized_elemwise_add_enum::kDataA],
dataA_min,
@@ -72,7 +72,7 @@ static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, cons
mkldnn::memory *rescaled_mem;
// output default set as int32
- size_t output_data_range = kInt32Range;
+ float output_data_range = kInt32Range;
auto output_data_type = mkldnn::memory::s32;
// dataA && dataB are uint8
if (out_data[quantized_elemwise_add_enum::kOut].dtype() == mshadow::kInt8) {
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc
new file mode 100644
index 000000000000..31da936915e6
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file mkldnn_quantized_flatten.cc
+ * \brief
+ */
+
+#if MXNET_USE_MKLDNN == 1
+#include "../../nn/mkldnn/mkldnn_flatten-inl.h"
+#include "../quantization_utils.h"
+
+namespace mxnet {
+namespace op {
+
+inline static bool FlattenStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask,
+ DispatchMode* dispatch_mode, std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 3U);
+ CHECK_EQ(out_attrs->size(), 3U);
+ return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+}
+
+static void MKLDNNQuantizedFlattenForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ MKLDNNFlattenForward(attrs, ctx, inputs[0], req[0], outputs[0]);
+ outputs[1].data().dptr<float>()[0] = inputs[1].data().dptr<float>()[0];
+ outputs[2].data().dptr<float>()[0] = inputs[2].data().dptr<float>()[0];
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_flatten)
+.set_attr("FInferStorageType", FlattenStorageType)
+.set_attr("FComputeEx", MKLDNNQuantizedFlattenForward)
+.set_attr("FResourceRequest", [](const NodeAttrs& n) {
+ return std::vector{ResourceRequest::kTempSpace};
+})
+.set_attr("TIsMKLDNN", true);
+
+} // namespace op
+} // namespace mxnet
+
+#endif // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h
index e7f7ccdf13b7..857841d0db1b 100644
--- a/src/operator/quantization/quantization_utils.h
+++ b/src/operator/quantization/quantization_utils.h
@@ -32,8 +32,8 @@
namespace mxnet {
namespace op {
-static const size_t kUint8Range = 255;
-static const size_t kInt8Range = 127;
+static const float kUint8Range = 255.5;
+static const float kInt8Range = 127.5;
static const size_t kInt32Range = 0x7fffffff;
template<typename T>
diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc
index e3139913332d..182f6339308a 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -22,11 +22,16 @@
* \file quantization.cc
* \brief
*/
+
+#include
#include
#include
-#include
+#include
+#include
#include
+#include
#include "quantize_v2-inl.h"
+#include "../../common/utils.h"
namespace mxnet {
namespace op {
@@ -37,7 +42,7 @@ using nnvm::NodePtr;
using nnvm::NodeEntry;
using nnvm::Graph;
-inline size_t GetNumOutputs(NodePtr node) {
+static inline size_t GetNumOutputs(NodePtr node) {
// Get NumOutputs, check if current node has NumVisibleOutputs function, if yes, return
// num_visible_outputs
size_t num_outputs = node->num_outputs();
@@ -116,17 +121,19 @@ bool isRegistered(NodePtr node, const int& dev_type) {
fcomputestateful != nullptr || fcomputestateful_ex != nullptr);
}
-inline NodePtr NeedQuantize(
- NodePtr node, const std::unordered_set<std::string>& excluded_nodes,
- const std::unordered_set<std::string>& excluded_ops,
- const int& dev_type) {
+inline QuantizeType NeedQuantize(NodePtr node,
+ const std::unordered_set<std::string>& excluded_nodes,
+ const std::unordered_set<std::string>& excluded_ops,
+ const int& dev_type,
+ std::unordered_map<NodePtr, NodePtr>* quantized_node_map) {
std::unordered_map<NodePtr, NodePtr> quantized_node;
+ static auto& quantizable_map = Op::GetAttr<mxnet::FQuantizable>("FQuantizable");
static auto& quantized_op_map = Op::GetAttr<mxnet::FQuantizedOp>("FQuantizedOp");
static auto& fexec_type = nnvm::Op::GetAttr<FExecType>("FExecType");
const auto& op = node->op();
-
+ bool need = false;
if (op && quantized_op_map.count(op)) {
- bool need = true;
+ need = true;
// If the quantized node is not registered with a computation function, the node
// will be excluded automatically.
auto q_ptr = quantized_op_map[node->op()];
@@ -154,20 +161,101 @@ inline NodePtr NeedQuantize(
}
}
}
-
if (need) {
- auto n_ptr = quantized_op_map[node->op()];
- auto tmp_node = n_ptr(node->attrs);
- if (tmp_node->op()) {
- quantized_node[node] = tmp_node;
+ auto quantized_node = quantized_op_map[op](node->attrs);
+ if (!quantized_node->op()) need = false;
+ if (need) {
+ quantized_node_map->insert(std::make_pair(node, quantized_node));
+ }
+ if (quantizable_map.count(op)) {
+ return quantizable_map[op](node->attrs);
} else {
- quantized_node[node] = nullptr;
+ return QuantizeType::kSupport;
}
- } else {
- quantized_node[node] = nullptr;
}
}
- return quantized_node[node];
+ CHECK(!need);
+ return QuantizeType::kNone;
+}
+
+enum quantize_bit {
+ kFromInput = 1,
+ kFromOutput = 2,
+};
+
+static void MarkQuantizedNodes(const Graph& src,
+ std::unordered_map<NodePtr, NodePtr>* quantized_node_map) {
+ const auto excluded_nodes = src.GetAttr<std::unordered_set<std::string>>("excluded_nodes");
+ const auto excluded_ops = src.GetAttr<std::unordered_set<std::string>>("excluded_ops");
+ const auto quantize_mode = src.GetAttr<std::string>("quantize_mode");
+ const auto dev_type = src.GetAttr<int>("target_ctx");
+
+ std::unordered_map<NodePtr, std::vector<NodePtr>> node_output_map;
+ std::unordered_set<NodePtr> must_quantize_nodes;
+ std::unordered_map<NodePtr, int> support_quantize_nodes;
+ // Build node_output_map, must_quantize_nodes and support_quantize_nodes;
+ DFSVisit(src.outputs, [&](const NodePtr& node) {
+ auto quantize_type =
+ NeedQuantize(node, excluded_nodes, excluded_ops, dev_type, quantized_node_map);
+ if (quantize_type == QuantizeType::kMust) {
+ must_quantize_nodes.insert(node);
+ } else if (quantize_type == QuantizeType::kSupport) {
+ support_quantize_nodes[node] = 0;
+ }
+ for (size_t i = 0; i < node->inputs.size(); ++i) {
+ node_output_map[node->inputs[i].node].push_back(node);
+ }
+ });
+
+ if (quantize_mode == "full") {
+ return;
+ } else if (quantize_mode == "smart") {
+ // Mark quantized nodes from input
+ std::queue<NodePtr> task_queue;
+ for (const auto& node : must_quantize_nodes) {
+ task_queue.push(node);
+ }
+ while (!task_queue.empty()) {
+ const auto& node = task_queue.front();
+ task_queue.pop();
+ for (size_t i = 0; i < node->inputs.size(); ++i) {
+ const auto& input = node->inputs[i].node;
+ auto it = support_quantize_nodes.find(input);
+ if (it != support_quantize_nodes.end()) {
+ it->second = it->second | kFromInput;
+ task_queue.push(input);
+ }
+ }
+ }
+
+ // Mark quantized nodes from output
+ for (const auto& node : must_quantize_nodes) {
+ task_queue.push(node);
+ }
+ while (!task_queue.empty()) {
+ const auto& node = task_queue.front();
+ task_queue.pop();
+ const auto& outputs = node_output_map[node];
+ for (size_t i = 0; i < outputs.size(); ++i) {
+ const auto& output = outputs[i];
+ auto it = support_quantize_nodes.find(output);
+ if (it != support_quantize_nodes.end()) {
+ it->second = it->second | kFromOutput;
+ task_queue.push(output);
+ }
+ }
+ }
+
+ // Summarize the result
+ for (const auto& node : support_quantize_nodes) {
+ CHECK(quantized_node_map->count(node.first));
+ if (node.second != (kFromInput | kFromOutput)) {
+ quantized_node_map->erase(node.first);
+ }
+ }
+ } else {
+ LOG(FATAL) << "unrecognized quantize mode: " << quantize_mode;
+ }
}
Graph QuantizeGraph(Graph &&src) {
@@ -175,24 +263,25 @@ Graph QuantizeGraph(Graph &&src) {
static const auto& need_requantize_map = Op::GetAttr<mxnet::FNeedRequantize>("FNeedRequantize");
static const auto& avoid_quantize_input_map =
Op::GetAttr<mxnet::FAvoidQuantizeInput>("FAvoidQuantizeInput");
- const auto dev_type = src.GetAttr<int>("target_ctx");
const auto offline_params = src.GetAttr<std::unordered_set<std::string>>("offline_params");
- const auto excluded_nodes = src.GetAttr<std::unordered_set<std::string>>("excluded_nodes");
- const auto excluded_ops = src.GetAttr<std::unordered_set<std::string>>("excluded_ops");
const auto quantized_dtype = src.GetAttr<std::string>("quantized_dtype");
+ std::unordered_map<NodePtr, NodePtr> quantized_node_map;
+ MarkQuantizedNodes(src, &quantized_node_map);
+
// mirror_map stores the mapping from the currently visited graph to the newly created quantized
// graph. Key is the currently visited graph's node pointer, and value is a copied node of the key
// node. The existing key's value may be updated with the newly created quantize/dequantize op.
std::unordered_map<Node*, NodePtr> mirror_map;
+ std::unordered_map<NodePtr, NodePtr> reverse_mirror_map;
nnvm::NodeEntryMap<NodeEntry> mirror_entry_map;
DFSVisit(src.outputs, [&](const NodePtr& node) {
NodePtr new_node = Node::Create();
// If the currently visited node needs quantization, insert a quantize op node before the
// current node and replace the current node with the quantized version in the new graph.
- auto tmp_node = NeedQuantize(node, excluded_nodes, excluded_ops, dev_type);
- if (tmp_node) {
- new_node = tmp_node;
+ if (quantized_node_map.count(node)) {
+ std::cout << node->attrs.name << " is quantized." << std::endl;
+ new_node = quantized_node_map[node];
// add data into quantized op input
for (size_t i = 0; i < node->inputs.size(); ++i) {
@@ -208,7 +297,7 @@ Graph QuantizeGraph(Graph &&src) {
if (avoid_quantize_input_map.count(node->op()) &&
avoid_quantize_input_map[node->op()](node->attrs, i)) {
new_node->inputs.emplace_back(mirror_entry);
- } else if (!NeedQuantize(e.node, excluded_nodes, excluded_ops, dev_type)) {
+ } else if (!quantized_node_map.count(e.node)) {
if (mirror_entry_map.count(e)) {
new_node->inputs.emplace_back(mirror_entry_map[e]);
} else {
@@ -261,7 +350,7 @@ Graph QuantizeGraph(Graph &&src) {
// skip non-quantized input
continue;
}
- if (NeedQuantize(e.node, excluded_nodes, excluded_ops, dev_type)) {
+ if (quantized_node_map.count(e.node)) {
// here we calculate the output number (exclude min/max, in order to
// calculate min/max index from mirror node) based on assumption that
// there is only 1min and 1max output from mirror node (which is
@@ -306,6 +395,7 @@ Graph QuantizeGraph(Graph &&src) {
// (e.g., a quantized_conv2d node), and insert a dequantize op node in the new graph if there
// are any. Otherwise, simply add a copy of the current node's entry to the inputs of
// the new_node.
+ if (!node->is_variable()) std::cout << node->attrs.name << " is NOT quantized." << std::endl;
*new_node = *node;
new_node->inputs.clear();
for (const auto& e : node->inputs) {
@@ -313,7 +403,7 @@ Graph QuantizeGraph(Graph &&src) {
NodeEntry mirror_entry = NodeEntry{
mirror_node, e.index, e.version};
// if input node is quantized operator, add dequantize node
- if (NeedQuantize(e.node, excluded_nodes, excluded_ops, dev_type) &&
+ if (quantized_node_map.count(e.node) &&
(mirror_node->op() != Op::Get("_contrib_dequantize"))) {
// here we calculate the output number (exclude min/max, in order to
// calculate min/max index from mirror node) based on assumption that
@@ -330,7 +420,8 @@ Graph QuantizeGraph(Graph &&src) {
dequantize_node->op()->attr_parser(&(dequantize_node->attrs));
new_node->inputs.emplace_back(dequantize_node, 0, 0);
- mirror_map[e.node.get()] = std::move(dequantize_node);
+ mirror_map[e.node.get()] = dequantize_node;
+ reverse_mirror_map[dequantize_node] = e.node;
} else if (mirror_entry_map.count(e)) {
new_node->inputs.emplace_back(
mirror_entry_map[e].node->inputs[0].node, e.index, e.version);
@@ -339,12 +430,13 @@ Graph QuantizeGraph(Graph &&src) {
}
}
}
- mirror_map[node.get()] = std::move(new_node);
+ mirror_map[node.get()] = new_node;
+ reverse_mirror_map[new_node] = node;
});
std::vector<NodeEntry> outputs;
for (const auto& e : src.outputs) {
- if (NeedQuantize(e.node, excluded_nodes, excluded_ops, dev_type)) {
+ if (quantized_node_map.count(e.node)) {
// Only insert dequantize for those Ops supports quantize and not excluded.
NodePtr mirror_node = mirror_map.at(e.node.get());
NodeEntry mirror_entry = NodeEntry{mirror_node, e.index, e.version};
@@ -372,103 +464,89 @@ Graph QuantizeGraph(Graph &&src) {
Graph ret;
ret.outputs = std::move(outputs);
- return ret;
-}
-Graph SetCalibTableToQuantizedGraph(Graph&& g) {
- static const auto& flist_outputs =
- nnvm::Op::GetAttr("FListOutputNames");
- static const auto& need_requantize_map =
- nnvm::Op::GetAttr("FNeedRequantize");
- const auto& calib_table =
- g.GetAttr>>("calib_table");
- DFSVisit(g.outputs, [&](const NodePtr& node) {
- // If the current op is requantize
- // find the thresholds from the calibration table with the key equal
- // to the current op's input node name, e.g. a quantized_conv2d node.
- if (node->op() == Op::Get("_contrib_requantize")) {
- NodePtr quantized_op_node = node->inputs[0].node;
- CHECK(quantized_op_node->op() != nullptr) << quantized_op_node->attrs.name
- << " must be an quantized op node";
- CHECK(need_requantize_map.count(quantized_op_node->op()) > 0
- && need_requantize_map[quantized_op_node->op()](quantized_op_node->attrs))
- << quantized_op_node->attrs.name << " op must register FNeedRequantize attr"
- " and the attr func should return true";
- const std::string prefix = "quantized_";
- CHECK(std::equal(prefix.begin(), prefix.end(), quantized_op_node->attrs.name.begin()))
- << "an quantized op should start with `quantized_`";
-
- std::string out_data_name = quantized_op_node->attrs.name.substr(prefix.size()) + "_";
- auto list_output_names_func = flist_outputs.get(quantized_op_node->op(), nullptr);
- // Here it's assumed that the quantized_op node only produces three outputs:
- // out_data, min_range, and max_range. So we want to get the pre-calculated min_calib_range
- // and max_calib_range from the calibration table for out_data. Here we create the output
- // data name same as its constructed in GraphExecutor::ExecuteMonCallback.
- if (list_output_names_func != nullptr) {
- std::vector<std::string> names = list_output_names_func(quantized_op_node->attrs);
- CHECK_EQ(names.size(), 3U) << "ListOutputNames is expected to return three string for"
- " quantized operators";
- out_data_name += names[0];
- } else {
- out_data_name += "0";
- }
- const auto calib_table_iter = calib_table.find(out_data_name);
- if (calib_table_iter != calib_table.end()) {
- node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first);
- node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second);
- node->op()->attr_parser(&(node->attrs));
- }
- } else if (node->op() == Op::Get("_contrib_quantize_v2")) {
- NodePtr float_op_node = node->inputs[0].node;
- auto float_op_idx = node->inputs[0].index;
- std::string out_data_name = float_op_node->attrs.name;
- if (float_op_node->op()) {
- auto list_output_names_func = flist_outputs.get(float_op_node->op(), nullptr);
- // We want to get the pre-calculated min_range and max_range from the calibration table for
- // out_data. Here we create the output data name same as its constructed in
- // GraphExecutor::ExecuteMonCallback.
- if (list_output_names_func != nullptr) {
- std::vector<std::string> names = list_output_names_func(float_op_node->attrs);
- out_data_name += "_" + names[float_op_idx];
+ static const auto& need_calib_input_map =
+ Op::GetAttr("FNeedCalibrateInput");
+ static const auto& need_calib_output_map =
+ Op::GetAttr("FNeedCalibrateOutput");
+ std::vector calib_nodes;
+ DFSVisit(ret.outputs, [&](const NodePtr& node) {
+ if (need_calib_input_map.count(node->op())) {
+ const auto calib_idx = need_calib_input_map[node->op()](node->attrs);
+ for (const auto &idx : calib_idx) {
+ if (reverse_mirror_map.count(node)) {
+ calib_nodes.push_back(common::GetOutputName(
+ {reverse_mirror_map[node], node->inputs[idx].index, node->inputs[idx].version}));
} else {
- out_data_name += "_" + std::to_string(float_op_idx);
- }
- }
- const auto calib_table_iter = calib_table.find(out_data_name);
- if (calib_table_iter != calib_table.end()) {
- node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first);
- node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second);
- node->op()->attr_parser(&(node->attrs));
- const QuantizeV2Param& param = nnvm::get<QuantizeV2Param>(node->attrs.parsed);
- if (param.out_type == QuantizeOutType::kUint8 &&
- param.min_calib_range.value() < 0.0f) {
- LOG(WARNING) << "Calibration statistics indicates that node `" << node->attrs.name
- << "` has negative input, consider use `auto` or `int8` as out_type";
+ const auto& e = node->inputs[idx];
+ if (e.node->is_variable()) {
+ calib_nodes.push_back(e.node->attrs.name);
+ } else {
+ if (reverse_mirror_map.count(e.node)) {
+ const auto& fp32_in_node = reverse_mirror_map.at(e.node);
+ calib_nodes.push_back(common::GetOutputName({fp32_in_node, e.index, e.version}));
+ } else {
+ LOG(FATAL) << "Can't find calibration node for " << node->attrs.name;
+ }
+ }
}
}
- } else if (node->op() == Op::Get("_contrib_quantized_batch_norm")) {
- auto quantized_op_idx = node->inputs[0].index;
- const std::string prefix = "quantized_";
- std::string out_data_name = node->attrs.name.substr(prefix.size());
- if (node->op()) {
- auto list_output_names_func = flist_outputs.get(node->op(), nullptr);
- // We want to get the pre-calculated min_range and max_range from the calibration table for
- // out_data. Here we create the output data name same as its constructed in
- // GraphExecutor::ExecuteMonCallback.
- if (list_output_names_func != nullptr) {
- std::vector<std::string> names = list_output_names_func(node->attrs);
- out_data_name += "_" + names[quantized_op_idx];
+ } else if (need_calib_output_map.count(node->op())) {
+ const auto calib_idx = need_calib_output_map[node->op()](node->attrs);
+ for (const auto& idx : calib_idx) {
+ if (reverse_mirror_map.count(node)) {
+ calib_nodes.push_back(
+ common::GetOutputName({reverse_mirror_map[node], static_cast<uint32_t>(idx), 0}));
} else {
- out_data_name += "_" + std::to_string(quantized_op_idx);
+ calib_nodes.push_back(common::GetOutputName({node, static_cast<uint32_t>(idx), 0}));
}
}
+ }
+ });
+ ret.attrs["calib_nodes"] = std::make_shared<dmlc::any>(std::move(calib_nodes));
+ return ret;
+}
- const auto calib_table_iter = calib_table.find(out_data_name);
- if (calib_table_iter != calib_table.end()) {
- node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first);
- node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second);
- node->op()->attr_parser(&(node->attrs));
- }
+static inline void SetCalibTableForEntry(
+ const NodeEntry& e, const NodePtr& node,
+ const std::unordered_map<std::string, std::pair<float, float>>& calib_table) {
+ std::string out_data_name = common::GetOutputName(e);
+ const std::string prefix = "quantized_";
+ if (e.node->attrs.name.rfind(prefix, 0) == 0) {
+ out_data_name = out_data_name.substr(prefix.size());
+ }
+ const auto calib_table_iter = calib_table.find(out_data_name);
+ if (calib_table_iter != calib_table.end()) {
+ std::cout << "Set calibration result to " << node->attrs.name
+ << " : min=" << calib_table_iter->second.first
+ << " max=" << calib_table_iter->second.second << std::endl;
+ node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first);
+ node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second);
+ if (node->op() && node->op()->attr_parser) node->op()->attr_parser(&(node->attrs));
+ } else {
+ std::cout << "Can't find calibration result for " << node->attrs.name << std::endl;
+ }
+}
+
+Graph SetCalibTableToQuantizedGraph(Graph&& g) {
+ const auto& calib_table =
+ g.GetAttr<std::unordered_map<std::string, std::pair<float, float>>>("calib_table");
+ static const auto& need_calib_input_map =
+ Op::GetAttr<mxnet::FNeedCalibrateInput>("FNeedCalibrateInput");
+ static const auto& need_calib_output_map =
+ Op::GetAttr<mxnet::FNeedCalibrateOutput>("FNeedCalibrateOutput");
+ std::cout << "Set calibration result to quantized symbol." << std::endl;
+ DFSVisit(g.outputs, [&](const NodePtr& node) {
+ if (need_calib_input_map.count(node->op())) {
+ const auto calib_idx = need_calib_input_map[node->op()](node->attrs);
+ CHECK_EQ(calib_idx.size(), 1);
+ const auto& idx = calib_idx[0];
+ SetCalibTableForEntry(node->inputs[idx], node, calib_table);
+ } else if (need_calib_output_map.count(node->op())) {
+ const auto calib_idx = need_calib_output_map[node->op()](node->attrs);
+ CHECK_EQ(calib_idx.size(), 1);
+ const auto& idx = calib_idx[0];
+ SetCalibTableForEntry({node, static_cast<uint32_t>(idx), 0}, node, calib_table);
}
});
return g;
@@ -477,6 +555,7 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
NNVM_REGISTER_PASS(QuantizeGraph)
.describe("")
.set_body(QuantizeGraph)
+.provide_graph_attr("calib_nodes")
.set_change_graph(true);
NNVM_REGISTER_PASS(SetCalibTableToQuantizedGraph)
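In the new "smart" mode above, ops that register FQuantizable as kMust (Convolution and FullyConnected, registered later in this patch) seed the marking; an op that merely supports INT8 stays quantized only if it is reached from a must-quantize op both when walking toward inputs and when walking toward outputs (kFromInput | kFromOutput). A hedged Python sketch of that marking logic, with hypothetical graph maps:

```
from collections import deque

K_FROM_INPUT, K_FROM_OUTPUT = 1, 2

def mark_smart(must_quantize, support_quantize, inputs_of, outputs_of):
    # inputs_of / outputs_of: dicts mapping a node to its producer / consumer nodes.
    # Returns the subset of support_quantize nodes to keep quantized.
    flags = {n: 0 for n in support_quantize}
    for bit, neighbors in ((K_FROM_INPUT, inputs_of), (K_FROM_OUTPUT, outputs_of)):
        queue = deque(must_quantize)
        while queue:
            node = queue.popleft()
            for nxt in neighbors.get(node, ()):
                if nxt in flags and not flags[nxt] & bit:
                    flags[nxt] |= bit
                    queue.append(nxt)
    return {n for n, f in flags.items() if f == (K_FROM_INPUT | K_FROM_OUTPUT)}
```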
diff --git a/src/operator/quantization/quantize_v2.cc b/src/operator/quantization/quantize_v2.cc
index e9017a58a82c..9a30386723be 100644
--- a/src/operator/quantization/quantize_v2.cc
+++ b/src/operator/quantization/quantize_v2.cc
@@ -114,6 +114,9 @@ If min_calib_range isn't presented, the output type will be int8.
.set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){
return std::vector{true};
})
+.set_attr("FNeedCalibrateInput", [](const NodeAttrs& attrs){
+ return std::vector{0};
+})
.set_attr("FResourceRequest", [](const NodeAttrs& attrs) {
const QuantizeV2Param ¶m = nnvm::get(attrs.parsed);
if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
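FNeedCalibrateInput / FNeedCalibrateOutput (added here and to requantize and quantized_batch_norm elsewhere in this patch) report which tensor of an op needs calibration statistics; the QuantizeGraph pass collects the corresponding tensor names into the new `calib_nodes` graph attribute, replacing the old `calib_layer` callback. A hedged sketch of how a calibration driver could consume that list (the helper below is hypothetical, not the MXNet API):

```
def build_calib_table(collected_stats, calib_nodes):
    # collected_stats: {tensor_name: (min, max)} gathered on calibration batches
    # calib_nodes: names exported by the QuantizeGraph pass
    return {name: collected_stats[name]
            for name in calib_nodes if name in collected_stats}
```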
diff --git a/src/operator/quantization/quantized_batch_norm.cc b/src/operator/quantization/quantized_batch_norm.cc
index 3c46e1b8bd5c..285c34d3a67f 100644
--- a/src/operator/quantization/quantized_batch_norm.cc
+++ b/src/operator/quantization/quantized_batch_norm.cc
@@ -112,6 +112,9 @@ the float32 data into int8.
.set_attr("FInferType", QuantizedBatchNormType)
.set_attr("FGradient", MakeZeroGradNodes)
.set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return false; })
+.set_attr("FNeedCalibrateOutput", [](const NodeAttrs& attrs){
+ return std::vector{0};
+})
.add_argument("data", "NDArray-or-Symbol", "Input data.")
.add_argument("gamma", "NDArray-or-Symbol", "gamma.")
.add_argument("beta", "NDArray-or-Symbol", "beta.")
diff --git a/src/operator/quantization/quantized_conv.cc b/src/operator/quantization/quantized_conv.cc
index aa3f5ce1ad61..9d774ddf24f1 100644
--- a/src/operator/quantization/quantized_conv.cc
+++ b/src/operator/quantization/quantized_conv.cc
@@ -180,6 +180,9 @@ and max thresholds representing the threholds for quantizing the float32 output
.add_arguments(ConvolutionParam::__FIELDS__());
NNVM_REGISTER_OP(Convolution)
+.set_attr("FQuantizable", [](const NodeAttrs& attrs) {
+ return QuantizeType::kMust;
+})
.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) {
nnvm::NodePtr node = nnvm::Node::Create();
node->attrs.op = Op::Get("_contrib_quantized_conv");
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 23790ca78b3d..4c9d9d2f8095 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -361,6 +361,9 @@ and max thresholds representing the threholds for quantizing the float32 output
.add_arguments(FullyConnectedParam::__FIELDS__());
NNVM_REGISTER_OP(FullyConnected)
+.set_attr("FQuantizable", [](const NodeAttrs& attrs) {
+ return QuantizeType::kMust;
+})
.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) {
nnvm::NodePtr node = nnvm::Node::Create();
node->attrs.op = Op::Get("_contrib_quantized_fully_connected");
diff --git a/src/operator/quantization/requantize.cc b/src/operator/quantization/requantize.cc
index 43682383b0d6..9ee299cf4ae9 100644
--- a/src/operator/quantization/requantize.cc
+++ b/src/operator/quantization/requantize.cc
@@ -77,6 +77,9 @@ inference accuracy.
#else
.set_attr("FCompute", RequantizeForward)
#endif
+.set_attr("FNeedCalibrateInput", [](const NodeAttrs& attrs){
+ return std::vector{0};
+})
.set_attr("FResourceRequest", [](const NodeAttrs& attrs) {
const RequantizeParam& param =
nnvm::get(attrs.parsed);
diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
index be2716f139ed..d9bfa02b8820 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
@@ -789,6 +789,9 @@ NNVM_REGISTER_OP(_sg_mkldnn_conv)
DefaultSubgraphOpMutableInputs)
.set_attr("key_var_num_args", "num_args")
.set_attr("FInplaceOption", SgMKLDNNConvInplaceOption)
+.set_attr("FQuantizable", [](const NodeAttrs& attrs) {
+ return QuantizeType::kMust;
+})
.set_attr("FQuantizedOp", SgMKLDNNConvQuantizedOp)
.set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
.set_attr("FAvoidQuantizeInput", SgMKLDNNAvoidQuantizeInput);
diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
index 6ebbb79d2571..a4f0bfa32ade 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
@@ -457,6 +457,9 @@ NNVM_REGISTER_OP(_sg_mkldnn_fully_connected)
.set_attr("FMutateInputs",
DefaultSubgraphOpMutableInputs)
.set_attr("key_var_num_args", "num_args")
+.set_attr("FQuantizable", [](const NodeAttrs& attrs) {
+ return QuantizeType::kMust;
+})
.set_attr("FQuantizedOp", SgMKLDNNFCQuantizedOp)
.set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; });
diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py
index eba77f2c8710..00e2e824c6f5 100644
--- a/tests/python/mkl/test_subgraph.py
+++ b/tests/python/mkl/test_subgraph.py
@@ -167,9 +167,9 @@ def check_quantize(sym, data_shape, out_type, name='conv',
quantized_dtype=out_type,
calib_mode='naive',
calib_data=calib_data,
- calib_layer=None,
label_names=None,
- num_calib_examples=1)
+ num_calib_examples=1,
+ quantize_mode='full')
qsym = qsym.get_backend_symbol(QUANTIZE_SG_PASS_NAME)
if check_calibration:
check_qsym_calibrated(qsym, out_type, name=name)
@@ -218,7 +218,6 @@ def check_quantize_whole_model(out_type):
calib_data = mx.nd.random.uniform(shape=data_shape)
calib_data = mx.io.NDArrayIter(data=calib_data)
calib_data = DummyIter(calib_data)
- calib_layer = lambda name: name.endswith('_output')
qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg,
arg_params=arg_params,
aux_params=aux_params,
@@ -227,9 +226,9 @@ def check_quantize_whole_model(out_type):
quantized_dtype=out_type,
calib_mode='naive',
calib_data=calib_data,
- calib_layer=calib_layer,
label_names=None,
- num_calib_examples=1)
+ num_calib_examples=1,
+ quantize_mode='full')
qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')
check_qsym_forward(qsym, qarg_params, qaux_params, data_shape)
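As the test changes show, `calib_layer` is gone and `quantize_model` now takes a `quantize_mode` argument ('full' or 'smart'). A minimal usage sketch mirroring the tests (assumes `sym`, `arg_params`, `aux_params` and `calib_data` are already defined):

```
import mxnet as mx

qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(
    sym=sym, arg_params=arg_params, aux_params=aux_params,
    ctx=mx.cpu(), quantized_dtype='auto',
    calib_mode='naive', calib_data=calib_data,
    num_calib_examples=1, label_names=None,
    quantize_mode='full')  # 'full' quantizes all supported ops; 'smart' lets the
                           # graph pass decide based on FQuantizable
```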
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 31bc1638b010..d753b514302e 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -80,7 +80,7 @@ def baseline_dequantization(qdata, real_range, qdata_np):
def test_nd_array_dequantization(qdata, min_range, max_range, expected_result):
data = mx.nd.contrib.dequantize(qdata, min_range, max_range, out_type='float32')
assert data.dtype == np.float32
- assert_almost_equal(data.asnumpy(), expected_result)
+ assert_almost_equal(data.asnumpy(), expected_result, atol = 1)
def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_result):
sym_data = mx.sym.Variable('data')
@@ -92,9 +92,9 @@ def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_resul
args={'data':qdata, 'min_range':min_range, 'max_range':max_range})
data = out.forward()[0]
assert data.dtype == np.float32
- assert_almost_equal(data.asnumpy(), expected_result)
+ assert_almost_equal(data.asnumpy(), expected_result, atol = 1)
- real_range = 402.3347
+ real_range = 128
shape = rand_shape_nd(4)
qdata_np = np.random.uniform(low=-127, high=127, size=shape).astype(dtype=np.int8)
qdata, min_range, max_range = get_test_data(real_range, qdata_np)
@@ -691,7 +691,8 @@ def test_quantize_params():
params = {}
for name in offline_params:
params[name] = mx.nd.uniform(shape=(2, 2))
- qsym = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(), offline_params=offline_params)
+ qsym, _ = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(),
+ offline_params=offline_params, quantize_mode='full')
qparams = mx.contrib.quant._quantize_params(qsym, params, th_dict = {})
param_names = params.keys()
qparam_names = qparams.keys()
@@ -807,7 +808,8 @@ def check_qsym_qdtype(qsym, qdtype):
aux_params=aux_params,
ctx=mx.current_context(),
quantized_dtype=qdtype,
- calib_mode='none')
+ calib_mode='none',
+ quantize_mode='full')
check_params(arg_params, qarg_params, qsym)
check_params(aux_params, qaux_params)
@@ -821,7 +823,8 @@ def check_qsym_qdtype(qsym, qdtype):
quantized_dtype=qdtype,
calib_mode='naive',
calib_data=calib_data,
- num_calib_examples=20)
+ num_calib_examples=20,
+ quantize_mode='full')
check_params(arg_params, qarg_params, qsym)
check_params(aux_params, qaux_params)
check_qsym_calibrated(qsym)
@@ -963,7 +966,8 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N
excluded_op_names=excluded_op_names,
ctx=mx.current_context(),
quantized_dtype=qdtype,
- calib_mode='none')
+ calib_mode='none',
+ quantize_mode='full')
check_params(arg_params, qarg_params, qsym)
check_params(aux_params, qaux_params)
check_qsym_forward(qsym, qarg_params, qaux_params, dshape, lshape)
@@ -980,7 +984,8 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N
quantized_dtype=qdtype,
calib_mode='naive',
calib_data=calib_data,
- num_calib_examples=20)
+ num_calib_examples=20,
+ quantize_mode='full')
check_params(arg_params, qarg_params, qsym)
check_params(aux_params, qaux_params)
check_qsym_calibrated(qsym)
@@ -1047,7 +1052,8 @@ def test_quantize_sym_with_calib():
sym = get_fp32_sym()
offline_params = [name for name in sym.list_arguments()
if not name.startswith('data') and not name.endswith('label')]
- qsym = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(), offline_params=offline_params)
+ qsym, _ = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(),
+ offline_params=offline_params, quantize_mode='full')
requantize_op_names = ['requantize_conv', 'requantize_fc']
th_dict = {'conv_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0)),
'fc_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0))}
@@ -1080,11 +1086,22 @@ def test_optimal_threshold_adversarial_case():
# The worst case for the optimal_threshold function is when the values are concentrated
# at one edge: [0, 0, ..., 1000]. (histogram)
# We want to make sure that the optimal threshold in this case is the max.
- arr = np.array([2] * 1000)
+ hist = []
+ hist_edges = []
+ min_val = -2
+ max_val = 2
+ for i in range(0, 998):
+ hist.append(0)
+ for i in range(0, 999):
+ hist_edges.append((max_val - min_val) / 999 * i + min_val)
+ hist.append(1000)
+ hist_edges.append(max_val)
+ hist_data = (hist, hist_edges, min_val, max_val, max_val)
for dtype in ['uint8', 'int8', 'auto']:
- res = mx.contrib.quant._get_optimal_threshold(arr, dtype, num_quantized_bins=5)
+ res = mx.contrib.quant._get_optimal_threshold(hist_data, dtype, num_quantized_bins=5)
# The threshold should be 2.
- assert res[3] - 2 < 1e-5
+ print (res)
+ assert abs(res[2] - 2) < 1e-5
@with_seed()
@@ -1097,9 +1114,15 @@ def get_threshold(nd):
return mx.nd.maximum(mx.nd.abs(min_nd), mx.nd.abs(max_nd)).asnumpy()
for dtype in ['uint8', 'int8', 'auto']:
- nd_dict = {'layer1': mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23), dtype=np.float64)}
- expected_threshold = get_threshold(nd_dict['layer1'])
- th_dict = mx.contrib.quant._get_optimal_thresholds(nd_dict, dtype)
+ nd = mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23), dtype=np.float64)
+ expected_threshold = get_threshold(nd)
+ arr = nd.asnumpy()
+ min_range = np.min(arr)
+ max_range = np.max(arr)
+ th = max(abs(min_range), abs(max_range))
+ hist, hist_edges = np.histogram(arr, bins=8001, range=(-th, th))
+ hist_dict = {'layer1' : (hist, hist_edges, min_range, max_range, th)}
+ th_dict = mx.contrib.quant._get_optimal_thresholds(hist_dict, dtype)
assert 'layer1' in th_dict
assert_almost_equal(np.array([th_dict['layer1'][1]]), expected_threshold, rtol=1e-2, atol=1e-4)
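The updated tests also show that `_get_optimal_threshold` and `_get_optimal_thresholds` now consume pre-binned histograms instead of raw activation tensors: each layer maps to a `(hist, hist_edges, min_val, max_val, th)` tuple. A standalone sketch of building that structure, mirroring the test above:

```
import numpy as np

arr = np.random.uniform(low=-10.5, high=11.3, size=(8, 3, 23, 23))
th = max(abs(arr.min()), abs(arr.max()))
hist, hist_edges = np.histogram(arr, bins=8001, range=(-th, th))
hist_dict = {'layer1': (hist, hist_edges, arr.min(), arr.max(), th)}
```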
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index ceee51a3e503..f864003f7c60 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -5571,9 +5571,10 @@ def test_quantization_op():
qa_real = mx.nd.array([[18, 75], [77, 109]])
a_real = mx.nd.array([[0.14173228, 0.5905512], [0.6062992, 0.8582677]])
-
+ print(a_.asnumpy())
+ print(a_real.asnumpy())
assert same(qa.asnumpy(), qa_real.asnumpy())
- assert same(a_.asnumpy(), a_real.asnumpy())
+ assert_almost_equal(a_.asnumpy(), a_real.asnumpy(), rtol=1e-2)
@with_seed()
def test_index_copy():