diff --git a/Jenkinsfile b/Jenkinsfile index f9e038f0c3cd..f3329cdb796f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -386,6 +386,24 @@ try { } } }, + 'Python2: Quantize GPU': { + node('mxnetlinux-gpu-p3') { + ws('workspace/ut-python2-quantize-gpu') { + init_git() + unpack_lib('gpu', mx_lib) + sh "ci/build.py --nvidiadocker --build --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python2_quantization_gpu" + } + } + }, + 'Python3: Quantize GPU': { + node('mxnetlinux-gpu-p3') { + ws('workspace/ut-python3-quantize-gpu') { + init_git() + unpack_lib('gpu', mx_lib) + sh "ci/build.py --nvidiadocker --build --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_quantization_gpu" + } + } + }, 'Python2: MKLDNN-CPU': { node('mxnetlinux-cpu') { ws('workspace/ut-python2-mkldnn-cpu') { diff --git a/benchmark/python/quantization/benchmark_op.py b/benchmark/python/quantization/benchmark_op.py new file mode 100644 index 000000000000..5ba7740cc918 --- /dev/null +++ b/benchmark/python/quantization/benchmark_op.py @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import time +import mxnet as mx +from mxnet.test_utils import check_speed + + +def quantize_int8_helper(data): + min_data = mx.nd.min(data) + max_data = mx.nd.max(data) + return mx.nd.contrib.quantize(data, min_data, max_data, out_type='int8') + + +def benchmark_convolution(data_shape, kernel, num_filter, pad, stride, no_bias=True, layout='NCHW', repeats=20): + ctx_gpu = mx.gpu(0) + data = mx.sym.Variable(name="data", shape=data_shape, dtype='float32') + # conv cudnn + conv_cudnn = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride, + no_bias=no_bias, layout=layout, cudnn_off=False, name="conv_cudnn") + arg_shapes, _, _ = conv_cudnn.infer_shape(data=data_shape) + input_data = mx.nd.random.normal(0, 0.2, shape=data_shape, ctx=ctx_gpu) + conv_weight_name = conv_cudnn.list_arguments()[1] + args = {data.name: input_data, conv_weight_name: mx.random.normal(0, 1, shape=arg_shapes[1], ctx=ctx_gpu)} + conv_cudnn_time = check_speed(sym=conv_cudnn, location=args, ctx=ctx_gpu, N=repeats, + grad_req='null', typ='forward') * 1000 + + # quantized_conv2d + qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8') + weight = mx.sym.Variable(name='weight', shape=arg_shapes[1], dtype='int8') + min_data = mx.sym.Variable(name='min_data', shape=(1,), dtype='float32') + max_data = mx.sym.Variable(name='max_data', shape=(1,), dtype='float32') + min_weight = mx.sym.Variable(name='min_weight', shape=(1,), dtype='float32') + max_weight = mx.sym.Variable(name='max_weight', shape=(1,), dtype='float32') + quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=weight, min_data=min_data, max_data=max_data, + min_weight=min_weight, max_weight=max_weight, + kernel=kernel, num_filter=num_filter, pad=pad, stride=stride, + no_bias=no_bias, layout=layout, cudnn_off=False, + name='quantized_conv2d') + qargs = {qdata.name: quantize_int8_helper(input_data)[0], + min_data.name: quantize_int8_helper(input_data)[1], + max_data.name: quantize_int8_helper(input_data)[2], + weight.name: quantize_int8_helper(args[conv_weight_name])[0], + min_weight.name: quantize_int8_helper(args[conv_weight_name])[1], + max_weight.name: quantize_int8_helper(args[conv_weight_name])[2]} + qconv_time = check_speed(sym=quantized_conv2d, location=qargs, ctx=ctx_gpu, N=repeats, + grad_req='null', typ='forward') * 1000 + + print('==================================================================================================') + print('data=%s, kernel=%s, num_filter=%s, pad=%s, stride=%s, no_bias=%s, layout=%s, repeats=%s' + % (data_shape, kernel, num_filter, pad, stride, no_bias, layout, repeats)) + print('%s , ctx=%s, time=%.2f ms' % (conv_cudnn.name + '-FP32', ctx_gpu, conv_cudnn_time)) + print('%s, ctx=%s, time=%.2f ms' % (quantized_conv2d.name, ctx_gpu, qconv_time)) + print('quantization speedup: %.1fX' % (conv_cudnn_time / qconv_time)) + print('\n') + + +if __name__ == '__main__': + for batch_size in [32, 64, 128]: + benchmark_convolution(data_shape=(batch_size, 64, 56, 56), kernel=(1, 1), num_filter=256, + pad=(0, 0), stride=(1, 1), layout='NCHW', repeats=20) + + benchmark_convolution(data_shape=(batch_size, 256, 56, 56), kernel=(1, 1), num_filter=64, + pad=(0, 0), stride=(1, 1), layout='NCHW', repeats=20) + + benchmark_convolution(data_shape=(batch_size, 256, 56, 56), kernel=(1, 1), num_filter=128, + pad=(0, 0), stride=(2, 2), layout='NCHW', repeats=20) + + benchmark_convolution(data_shape=(batch_size, 128, 28, 28), kernel=(3, 3), num_filter=128, + pad=(1, 1), stride=(1, 1), 
layout='NCHW', repeats=20) + + benchmark_convolution(data_shape=(batch_size, 1024, 14, 14), kernel=(1, 1), num_filter=256, + pad=(0, 0), stride=(1, 1), layout='NCHW', repeats=20) + + benchmark_convolution(data_shape=(batch_size, 2048, 7, 7), kernel=(1, 1), num_filter=512, + pad=(0, 0), stride=(1, 1), layout='NCHW', repeats=20) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index abb37022d668..1d005ce6dc69 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -361,6 +361,7 @@ unittest_ubuntu_python2_cpu() { export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-2.7 --verbose tests/python/unittest nosetests-2.7 --verbose tests/python/train + nosetests-2.7 --verbose tests/python/quantization } unittest_ubuntu_python3_cpu() { @@ -371,6 +372,7 @@ unittest_ubuntu_python3_cpu() { #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-3.4 --verbose tests/python/unittest + nosetests-3.4 --verbose tests/python/quantization } unittest_ubuntu_python2_gpu() { @@ -393,6 +395,30 @@ unittest_ubuntu_python3_gpu() { nosetests-3.4 --verbose tests/python/gpu } +# quantization gpu currently only runs on P3 instances +# need to separate it from unittest_ubuntu_python2_gpu() +unittest_ubuntu_python2_quantization_gpu() { + set -ex + export PYTHONPATH=./python/ + # MXNET_MKLDNN_DEBUG is buggy and produces false positives + # https://github.com/apache/incubator-mxnet/issues/10026 + #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + nosetests-2.7 --verbose tests/python/quantization_gpu +} + +# quantization gpu currently only runs on P3 instances +# need to separate it from unittest_ubuntu_python3_gpu() +unittest_ubuntu_python3_quantization_gpu() { + set -ex + export PYTHONPATH=./python/ + # MXNET_MKLDNN_DEBUG is buggy and produces false positives + # https://github.com/apache/incubator-mxnet/issues/10026 + #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + nosetests-3.4 --verbose tests/python/quantization_gpu +} + unittest_ubuntu_cpu_scala() { set -ex make scalapkg USE_BLAS=openblas diff --git a/example/quantization/README.md b/example/quantization/README.md new file mode 100644 index 000000000000..63b65574d3ac --- /dev/null +++ b/example/quantization/README.md @@ -0,0 +1,22 @@ +# Model Quantization with Calibration Examples +This folder contains examples of quantizing an FP32 model with or without calibration and using the calibrated +quantized models for inference. Two pre-trained imagenet models are taken as examples for quantization. One is +[Resnet-152](http://data.mxnet.io/models/imagenet/resnet/152-layers/), and the other one is +[Inception with BatchNorm](http://data.mxnet.io/models/imagenet/inception-bn/). The calibration dataset +is the [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) for testing the pre-trained models. + +Here are the details of the four files in this folder. +- `imagenet_gen_qsym.py` This script provides an example of taking FP32 models and a calibration dataset to generate +calibrated quantized models. When launched for the first time, the script will download the user-specified model, +either Resnet-152 or Inception, +and calibration dataset into `model` and `data` folders, respectively. The generated quantized models can be found in +the `model` folder.
+- `imagenet_inference.py` This script is used for calculating the accuracy of FP32 models or quantized models on the +validation dataset which was downloaded for calibration in `imagenet_gen_qsym.py`. +- `launch_quantize.sh` This is a shell script that generates various quantized models for Resnet-152 and +Inception with BatchNorm under different configurations. Users can copy and paste the command from the script to +the console to run model quantization for a specific configuration. +- `launch_inference.sh` This is a shell script that calculates the accuracies of all the quantized models generated +by invoking `launch_quantize.sh`. + +**NOTE**: This example has only been tested on Linux systems. \ No newline at end of file diff --git a/example/quantization/common b/example/quantization/common new file mode 120000 index 000000000000..cafb9140ab6a --- /dev/null +++ b/example/quantization/common @@ -0,0 +1 @@ +../image-classification/common \ No newline at end of file diff --git a/example/quantization/imagenet_gen_qsym.py b/example/quantization/imagenet_gen_qsym.py new file mode 100644 index 000000000000..045ce62489ad --- /dev/null +++ b/example/quantization/imagenet_gen_qsym.py @@ -0,0 +1,194 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import os +import logging +from common import modelzoo +import mxnet as mx +from mxnet.contrib.quantization import * + + +def download_calib_dataset(dataset_url, calib_dataset, logger=None): + if logger is not None: + logger.info('Downloading calibration dataset from %s to %s' % (dataset_url, calib_dataset)) + mx.test_utils.download(dataset_url, calib_dataset) + + +def download_model(model_name, logger=None): + dir_path = os.path.dirname(os.path.realpath(__file__)) + model_path = os.path.join(dir_path, 'model') + if logger is not None: + logger.info('Downloading model %s...
into path %s' % (model_name, model_path)) + return modelzoo.download_model(args.model, os.path.join(dir_path, 'model')) + + +def save_symbol(fname, sym, logger=None): + if logger is not None: + logger.info('Saving symbol into file at %s' % fname) + sym.save(fname) + + +def save_params(fname, arg_params, aux_params, logger=None): + if logger is not None: + logger.info('Saving params into file at %s' % fname) + save_dict = {('arg:%s' % k): v.as_in_context(cpu()) for k, v in arg_params.items()} + save_dict.update({('aux:%s' % k): v.as_in_context(cpu()) for k, v in aux_params.items()}) + mx.nd.save(fname, save_dict) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model') + parser.add_argument('--model', type=str, choices=['imagenet1k-resnet-152', 'imagenet1k-inception-bn'], + help='currently only supports imagenet1k-resnet-152 or imagenet1k-inception-bn') + parser.add_argument('--batch-size', type=int, default=32) + parser.add_argument('--label-name', type=str, default='softmax_label') + parser.add_argument('--calib-dataset', type=str, default='data/val_256_q90.rec', + help='path of the calibration dataset') + parser.add_argument('--image-shape', type=str, default='3,224,224') + parser.add_argument('--data-nthreads', type=int, default=60, + help='number of threads for data decoding') + parser.add_argument('--num-calib-batches', type=int, default=10, + help='number of batches for calibration') + parser.add_argument('--exclude-first-conv', action='store_true', default=True, + help='excluding quantizing the first conv layer since the' + ' number of channels is usually not a multiple of 4 in that layer' + ' which does not satisfy the requirement of cuDNN') + parser.add_argument('--shuffle-dataset', action='store_true', default=True, + help='shuffle the calibration dataset') + parser.add_argument('--shuffle-chunk-seed', type=int, default=3982304, + help='shuffling chunk seed, see' + ' https://mxnet.incubator.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter' + ' for more details') + parser.add_argument('--shuffle-seed', type=int, default=48564309, + help='shuffling seed, see' + ' https://mxnet.incubator.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter' + ' for more details') + parser.add_argument('--calib-mode', type=str, default='entropy', + help='calibration mode used for generating calibration table for the quantized symbol; supports' + ' 1. none: no calibration will be used. The thresholds for quantization will be calculated' + ' on the fly. This will result in inference speed slowdown and loss of accuracy' + ' in general.' + ' 2. naive: simply take min and max values of layer outputs as thresholds for' + ' quantization. In general, the inference accuracy worsens with more examples used in' + ' calibration. It is recommended to use `entropy` mode as it produces more accurate' + ' inference results.' + ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal' + ' thresholds. 
This mode is expected to produce the best inference accuracy of all three' + ' kinds of quantized models if the calibration dataset is representative enough of the' + ' inference dataset.') + args = parser.parse_args() + + logging.basicConfig() + logger = logging.getLogger('logger') + logger.setLevel(logging.INFO) + + logger.info('shuffle_dataset=%s' % args.shuffle_dataset) + + calib_mode = args.calib_mode + logger.info('calibration mode set to %s' % calib_mode) + + # download calibration dataset + if calib_mode != 'none': + download_calib_dataset('http://data.mxnet.io/data/val_256_q90.rec', args.calib_dataset) + + # download model + prefix, epoch = download_model(model_name=args.model, logger=logger) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + + # get batch size + batch_size = args.batch_size + logger.info('batch size = %d for calibration' % batch_size) + + # get number of batches for calibration + num_calib_batches = args.num_calib_batches + if calib_mode != 'none': + logger.info('number of batches = %d for calibration' % num_calib_batches) + + # get number of threads for decoding the dataset + data_nthreads = args.data_nthreads + + # get image shape + image_shape = args.image_shape + + exclude_first_conv = args.exclude_first_conv + excluded_sym_names = [] + if args.model == 'imagenet1k-resnet-152': + rgb_mean = '0,0,0' + calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1 + or name.find('sc') != -1 + or name.find('fc') != -1) + if exclude_first_conv: + excluded_sym_names = ['conv0'] + elif args.model == 'imagenet1k-inception-bn': + rgb_mean = '123.68,116.779,103.939' + calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1 + or name.find('fc') != -1) + if exclude_first_conv: + excluded_sym_names = ['conv_1'] + else: + raise ValueError('model %s is not supported in this script' % args.model) + + label_name = args.label_name + logger.info('label_name = %s' % label_name) + + data_shape = tuple([int(i) for i in image_shape.split(',')]) + logger.info('Input data shape = %s' % str(data_shape)) + + logger.info('rgb_mean = %s' % rgb_mean) + rgb_mean = [float(i) for i in rgb_mean.split(',')] + mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]} + + if calib_mode == 'none': + logger.info('Quantizing FP32 model %s' % args.model) + qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, + excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, logger=logger) + sym_name = '%s-symbol.json' % (prefix + '-quantized') + save_symbol(sym_name, qsym, logger) + else: + logger.info('Creating ImageRecordIter for reading calibration dataset') + data = mx.io.ImageRecordIter(path_imgrec=args.calib_dataset, + label_width=1, + preprocess_threads=data_nthreads, + batch_size=batch_size, + data_shape=data_shape, + label_name=label_name, + rand_crop=False, + rand_mirror=False, + shuffle=args.shuffle_dataset, + shuffle_chunk_seed=args.shuffle_chunk_seed, + seed=args.shuffle_seed, + **mean_args) + + cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, + ctx=mx.gpu(0), excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, calib_data=data, + num_calib_examples=num_calib_batches * batch_size, + calib_layer=calib_layer, logger=logger) + if calib_mode == 'entropy': + suffix = '-quantized-%dbatches-entropy' % num_calib_batches + elif calib_mode == 'naive': + suffix = '-quantized-%dbatches-naive' % 
num_calib_batches + else: + raise ValueError('unknow calibration mode %s received, only supports `none`, `naive`, and `entropy`' + % calib_mode) + sym_name = '%s-symbol.json' % (prefix + suffix) + save_symbol(sym_name, cqsym, logger) + + param_name = '%s-%04d.params' % (prefix + '-quantized', epoch) + save_params(param_name, qarg_params, aux_params, logger) diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py new file mode 100644 index 000000000000..fe3f2661c655 --- /dev/null +++ b/example/quantization/imagenet_inference.py @@ -0,0 +1,176 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import logging +import os +import time +import mxnet as mx +from mxnet import nd +from mxnet.contrib.quantization import * + + +def download_dataset(dataset_url, dataset_dir, logger=None): + if logger is not None: + logger.info('Downloading dataset for inference from %s to %s' % (dataset_url, dataset_dir)) + mx.test_utils.download(dataset_url, dataset_dir) + + +def load_model(symbol_file, param_file, logger=None): + cur_path = os.path.dirname(os.path.realpath(__file__)) + symbol_file_path = os.path.join(cur_path, symbol_file) + if logger is not None: + logger.info('Loading symbol from file %s' % symbol_file_path) + symbol = mx.sym.load(symbol_file_path) + + param_file_path = os.path.join(cur_path, param_file) + if logger is not None: + logger.info('Loading params from file %s' % param_file_path) + save_dict = nd.load(param_file_path) + arg_params = {} + aux_params = {} + for k, v in save_dict.items(): + tp, name = k.split(':', 1) + if tp == 'arg': + arg_params[name] = v + if tp == 'aux': + aux_params[name] = v + return symbol, arg_params, aux_params + + +def advance_data_iter(data_iter, n): + assert n >= 0 + if n == 0: + return data_iter + has_next_batch = True + while has_next_batch: + try: + data_iter.next() + n -= 1 + if n == 0: + return data_iter + except StopIteration: + has_next_batch = False + + +def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples, logger=None): + metrics = [mx.metric.create('acc'), + mx.metric.create('top_k_accuracy', top_k=5)] + if not isinstance(metrics, list): + metrics = [metrics, ] + mod = mx.mod.Module(symbol=sym, context=devs, label_names=[label_name, ]) + mod.bind(for_training=False, + data_shapes=data.provide_data, + label_shapes=data.provide_label) + mod.set_params(arg_params, aux_params) + + tic = time.time() + num = 0 + for batch in data: + mod.forward(batch, is_train=False) + for m in metrics: + mod.update_metric(m, batch.label) + num += batch_size + if max_num_examples is not None and num >= max_num_examples: + break + + speed = num / (time.time() - tic) + + if logger is not None: + logger.info('Finished inference with %d images' % num) + 
logger.info('Finished with %f images per second', speed) + for m in metrics: + logger.info(m.get()) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Score a model on a dataset') + parser.add_argument('--symbol-file', type=str, required=True, help='symbol file path') + parser.add_argument('--param-file', type=str, required=True, help='param file path') + parser.add_argument('--batch-size', type=int, default=32) + parser.add_argument('--label-name', type=str, default='softmax_label') + parser.add_argument('--dataset', type=str, required=True, help='dataset path') + parser.add_argument('--rgb-mean', type=str, default='0,0,0') + parser.add_argument('--image-shape', type=str, default='3,224,224') + parser.add_argument('--data-nthreads', type=int, default=60, help='number of threads for data decoding') + parser.add_argument('--num-skipped-batches', type=int, default=0, help='skip the number of batches for inference') + parser.add_argument('--num-inference-batches', type=int, required=True, help='number of images used for inference') + parser.add_argument('--shuffle-dataset', action='store_true', default=True, + help='shuffle the calibration dataset') + parser.add_argument('--shuffle-chunk-seed', type=int, default=3982304, + help='shuffling chunk seed, see' + ' https://mxnet.incubator.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter' + ' for more details') + parser.add_argument('--shuffle-seed', type=int, default=48564309, + help='shuffling seed, see' + ' https://mxnet.incubator.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter' + ' for more details') + + args = parser.parse_args() + + logging.basicConfig() + logger = logging.getLogger('logger') + logger.setLevel(logging.INFO) + + symbol_file = args.symbol_file + param_file = args.param_file + data_nthreads = args.data_nthreads + + batch_size = args.batch_size + logger.info('batch size = %d for inference' % batch_size) + + rgb_mean = args.rgb_mean + logger.info('rgb_mean = %s' % rgb_mean) + rgb_mean = [float(i) for i in rgb_mean.split(',')] + mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]} + + label_name = args.label_name + logger.info('label_name = %s' % label_name) + + image_shape = args.image_shape + data_shape = tuple([int(i) for i in image_shape.split(',')]) + logger.info('Input data shape = %s' % str(data_shape)) + + dataset = args.dataset + download_dataset('http://data.mxnet.io/data/val_256_q90.rec', dataset) + logger.info('Dataset for inference: %s' % dataset) + + # creating data iterator + data = mx.io.ImageRecordIter(path_imgrec=dataset, + label_width=1, + preprocess_threads=data_nthreads, + batch_size=batch_size, + data_shape=data_shape, + label_name=label_name, + rand_crop=False, + rand_mirror=False, + shuffle=True, + shuffle_chunk_seed=3982304, + seed=48564309, + **mean_args) + + # loading model + sym, arg_params, aux_params = load_model(symbol_file, param_file, logger) + + # make sure that fp32 inference works on the same images as calibrated quantized model + logger.info('Skipping the first %d batches' % args.num_skipped_batches) + data = advance_data_iter(data, args.num_skipped_batches) + + num_inference_images = args.num_inference_batches * batch_size + logger.info('Running model %s for inference' % symbol_file) + score(sym, arg_params, aux_params, data, [mx.gpu(0)], label_name, + max_num_examples=num_inference_images, logger=logger) diff --git a/example/quantization/launch_inference.sh 
b/example/quantization/launch_inference.sh new file mode 100755 index 000000000000..8c839ba0f611 --- /dev/null +++ b/example/quantization/launch_inference.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex + +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --param-file=./model/imagenet1k-resnet-152-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec + +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec + +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-10batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-50batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec + +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-entropy-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-10batches-entropy-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-50batches-entropy-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec + + +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbol.json --param-file=./model/imagenet1k-inception-bn-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec + +python imagenet_inference.py 
--symbol-file=./model/imagenet1k-inception-bn-quantized-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec + +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-10batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-50batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec + +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-entropy-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-10batches-entropy-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-50batches-entropy-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec diff --git a/example/quantization/launch_quantize.sh b/example/quantization/launch_quantize.sh new file mode 100755 index 000000000000..9aa4bee4bff1 --- /dev/null +++ b/example/quantization/launch_quantize.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-mode=none + +python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=naive +python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=naive +python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=naive + +python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=entropy +python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=entropy +python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=entropy + + +python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-mode=none + +python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=naive +python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=naive +python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=naive + +python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=entropy +python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=entropy +python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=entropy diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index e85afe522f0a..ede137e89b7a 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1386,8 +1386,37 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym, const int **aux_type_data, int *complete); - - +/*! + * \brief Convert a symbol into a quantized symbol where FP32 operators are replaced with INT8 + * \param sym_handle symbol to be converted + * \param ret_sym_handle quantized symbol result + * \param num_excluded_symbols number of layers excluded from being quantized in the input symbol + * \param excluded_symbols array of symbols to be excluded from being quantized + * \param num_offline number of parameters that are quantized offline + * \param offline_params array of c strings representing the names of params quantized offline + */ +MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle, + SymbolHandle *ret_sym_handle, + const mx_uint num_excluded_symbols, + const SymbolHandle *excluded_symbols, + const mx_uint num_offline, + const char **offline_params); + +/*! 
+ * \brief Set calibration table to node attributes in the sym + * \param sym_handle symbol whose node attributes are to be set by calibration table + * \param num_layers number of layers in the calibration table + * \param layer names stored as keys in the calibration table + * \param low_quantiles low quantiles of layers stored in the calibration table + * \param high_quantiles high quantiles of layers stored in the calibration table + * \param ret_sym_handle returned symbol + */ +MXNET_DLL int MXSetCalibTableToQuantizedSymbol(SymbolHandle qsym_handle, + const mx_uint num_layers, + const char** layer_names, + const float* low_quantiles, + const float* high_quantiles, + SymbolHandle* ret_sym_handle); //-------------------------------------------- // Part 4: Executor interface diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index fb41d3960995..820eb1aa7b02 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -239,7 +239,6 @@ using FCompute = std::function& outputs)>; /*! * \brief Resiger an NDArray compute function for simple stateless forward only operator - * * \note Register under "FComputeEx" and "FComputeEx" * Dispatched only when inferred dispatch_mode is FDispatchComputeEx */ @@ -261,6 +260,20 @@ using FInferStorageType = std::function* in_attrs, std::vector* out_attrs)>; +/*! + * \brief Register a quantized node creation function based on the attrs of the node + * \note Register under "FQuantizedOp" for non-quantized operators + */ +using FQuantizedOp = std::function; + +/*! + * \brief Register a function to determine if the output of a quantized operator + * needs to be requantized. This is usually used for the operators + * taking int8 data types while accumulating in int32, e.g. quantized_conv. + * \note Register under "FNeedRequantize" for non-quantized operators + */ +using FNeedRequantize = std::function; + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/python/mxnet/contrib/__init__.py b/python/mxnet/contrib/__init__.py index 63cd8ce26649..fbfd3469678b 100644 --- a/python/mxnet/contrib/__init__.py +++ b/python/mxnet/contrib/__init__.py @@ -30,3 +30,5 @@ from . import text from . import onnx from . import io +from . import quantization +from . import quantization as quant diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py new file mode 100644 index 000000000000..c9c58a9c9ba4 --- /dev/null +++ b/python/mxnet/contrib/quantization.py @@ -0,0 +1,520 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Quantization module for generating quantized (INT8) models from FP32 models.""" + +from __future__ import absolute_import + +try: + from scipy import stats +except ImportError: + stats = None + +import ctypes +import logging +import os +import numpy as np +from ..base import _LIB, check_call, py_str +from ..base import c_array, c_str, mx_uint, c_str_array +from ..base import NDArrayHandle, SymbolHandle +from ..symbol import Symbol +from ..symbol import load as sym_load +from .. import ndarray +from ..ndarray import load as nd_load +from ..ndarray import NDArray +from ..io import DataIter +from ..context import cpu, Context +from ..module import Module + + +def _quantize_params(qsym, params): + """Given a quantized symbol and a dict of params that have not been quantized, + generate quantized params. Currently only supports quantizing the arg_params + with names of `weight` or `bias`, not aux_params. If `qsym` contains symbols + that are excluded from being quantized, their corresponding params will + not be quantized, but saved together with quantized params of the symbols that + have been quantized. + + Parameters + ---------- + qsym : Symbol + Quantized symbol from FP32 symbol. + params : dict of str->NDArray + """ + inputs_name = qsym.list_arguments() + quantized_params = {} + for name in inputs_name: + if name.endswith(('weight_quantize', 'bias_quantize')): + original_name = name[:-len('_quantize')] + param = params[original_name] + val, vmin, vmax = ndarray.contrib.quantize(data=param, + min_range=ndarray.min(param), + max_range=ndarray.max(param), + out_type='int8') + quantized_params[name] = val + quantized_params[name+'_min'] = vmin + quantized_params[name+'_max'] = vmax + elif name in params: + quantized_params[name] = params[name] + return quantized_params + + +def _quantize_symbol(sym, excluded_symbols=None, offline_params=None): + """Given a symbol object representing a neural network of data type FP32, + quantize it into a INT8 network. + + Parameters + ---------- + sym : Symbol + FP32 neural network symbol. + excluded_symbols : list of symbols + Nodes in the network that users do not want to replace with a symbol of INT8 data type. + offline_params : list of strs + Names of the parameters that users want to quantize offline. It's always recommended to + quantize parameters offline so that quantizing parameters during the inference can be + avoided. + """ + num_excluded_symbols = 0 + excluded_handles = [] + if excluded_symbols is not None: + assert isinstance(excluded_symbols, list) + num_excluded_symbols = len(excluded_symbols) + for s in excluded_symbols: + excluded_handles.append(s.handle) + + num_offline = 0 + offline = [] + if offline_params is not None: + num_offline = len(offline_params) + for k in offline_params: + offline.append(c_str(k)) + + out = SymbolHandle() + check_call(_LIB.MXQuantizeSymbol(sym.handle, + ctypes.byref(out), + mx_uint(num_excluded_symbols), + c_array(SymbolHandle, excluded_handles), + mx_uint(num_offline), + c_array(ctypes.c_char_p, offline))) + return Symbol(out) + + +class _LayerOutputCollector(object): + """Saves layer output NDArray in a dict with layer names as keys and lists of NDArrays as + values. The collected NDArrays will be used for calculating the optimal thresholds for + quantization using KL divergence. 
+ """ + def __init__(self, include_layer=None, logger=None): + self.nd_dict = {} + self.include_layer = include_layer + self.logger = logger + + def collect(self, name, arr): + """Callback function for collecting layer output NDArrays.""" + name = py_str(name) + if self.include_layer is not None and not self.include_layer(name): + return + handle = ctypes.cast(arr, NDArrayHandle) + arr = NDArray(handle, writable=False).copyto(cpu()) + if self.logger is not None: + self.logger.info("Collecting layer %s output of shape %s" % (name, arr.shape)) + if name in self.nd_dict: + self.nd_dict[name].append(arr) + else: + self.nd_dict[name] = [arr] + + +class _LayerOutputMinMaxCollector(object): + """Saves layer output min and max values in a dict with layer names as keys. + The collected min and max values will be directly used as thresholds for quantization. + """ + def __init__(self, include_layer=None, logger=None): + self.min_max_dict = {} + self.include_layer = include_layer + self.logger = logger + + def collect(self, name, arr): + """Callback function for collecting min and max values from an NDArray.""" + name = py_str(name) + if self.include_layer is not None and not self.include_layer(name): + return + handle = ctypes.cast(arr, NDArrayHandle) + arr = NDArray(handle, writable=False) + min_range = ndarray.min(arr).asscalar() + max_range = ndarray.max(arr).asscalar() + if name in self.min_max_dict: + cur_min_max = self.min_max_dict[name] + self.min_max_dict[name] = (min(cur_min_max[0], min_range), + max(cur_min_max[1], max_range)) + else: + self.min_max_dict[name] = (min_range, max_range) + if self.logger is not None: + self.logger.info("Collecting layer %s output min_range=%f, max_range=%f" + % (name, min_range, max_range)) + + +def _calibrate_quantized_sym(qsym, th_dict): + """Given a dictionary containing the thresholds for quantizing the layers, + set the thresholds into the quantized symbol as the params of requantize operators. + """ + if th_dict is None or len(th_dict) == 0: + return qsym + num_layer_outputs = len(th_dict) + layer_output_names = [] + min_vals = [] + max_vals = [] + for k, v in th_dict.items(): + layer_output_names.append(k) + min_vals.append(v[0]) + max_vals.append(v[1]) + + calibrated_sym = SymbolHandle() + check_call(_LIB.MXSetCalibTableToQuantizedSymbol(qsym.handle, + mx_uint(num_layer_outputs), + c_str_array(layer_output_names), + c_array(ctypes.c_float, min_vals), + c_array(ctypes.c_float, max_vals), + ctypes.byref(calibrated_sym))) + return Symbol(calibrated_sym) + + +def _collect_layer_statistics(mod, data, collector, max_num_examples=None, logger=None): + if not isinstance(data, DataIter): + raise ValueError('Only supports data as a type of DataIter, while received type %s' + % str(type(data))) + mod._exec_group.execs[0].set_monitor_callback(collector.collect) + num_batches = 0 + num_examples = 0 + for batch in data: + mod.forward(data_batch=batch, is_train=False) + num_batches += 1 + num_examples += data.batch_size + if max_num_examples is not None and num_examples >= max_num_examples: + break + if logger is not None: + logger.info("Collected statistics from %d batches with batch_size=%d" + % (num_batches, data.batch_size)) + return num_examples + + +def _collect_layer_output_min_max(mod, data, include_layer=None, + max_num_examples=None, logger=None): + """Collect min and max values from layer outputs and save them in + a dictionary mapped by layer names. 
+ """ + collector = _LayerOutputMinMaxCollector(include_layer=include_layer, logger=logger) + num_examples = _collect_layer_statistics(mod, data, collector, max_num_examples, logger) + return collector.min_max_dict, num_examples + + +def _collect_layer_outputs(mod, data, include_layer=None, max_num_examples=None, logger=None): + """Collect layer outputs and save them in a dictionary mapped by layer names.""" + collector = _LayerOutputCollector(include_layer=include_layer, logger=logger) + num_examples = _collect_layer_statistics(mod, data, collector, max_num_examples, logger) + return collector.nd_dict, num_examples + + +def _smooth_distribution(p, eps=0.0001): + """Given a discrete distribution (may have not been normalized to 1), + smooth it by replacing zeros with eps multiplied by a scaling factor and taking the + corresponding amount off the non-zero values. + Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf + """ + is_zeros = (p == 0).astype(np.float32) + is_nonzeros = (p != 0).astype(np.float32) + n_zeros = is_zeros.sum() + n_nonzeros = p.size - n_zeros + eps1 = eps * float(n_zeros) / float(n_nonzeros) + assert eps1 < 1.0, 'n_zeros=%d, n_nonzeros=%d, eps1=%f' % (n_zeros, n_nonzeros, eps1) + hist = p.astype(np.float32) + hist += eps * is_zeros + (-eps1) * is_nonzeros + assert (hist <= 0).sum() == 0 + return hist + + +# pylint: disable=line-too-long +def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255): + """Given a dataset, find the optimal threshold for quantizing it. + Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + """ + if isinstance(arr, NDArray): + arr = arr.asnumpy() + elif isinstance(arr, list): + assert len(arr) != 0 + for i, nd in enumerate(arr): + if isinstance(nd, NDArray): + arr[i] = nd.asnumpy() + elif not isinstance(nd, np.ndarray): + raise TypeError('get_optimal_threshold only supports input type of NDArray,' + ' list of np.ndarrays or NDArrays, and np.ndarray,' + ' while received type=%s' % (str(type(nd)))) + arr = np.concatenate(arr) + elif not isinstance(arr, np.ndarray): + raise TypeError('get_optimal_threshold only supports input type of NDArray,' + ' list of NDArrays and np.ndarray,' + ' while received type=%s' % (str(type(arr)))) + min_val = np.min(arr) + max_val = np.max(arr) + th = max(abs(min_val), abs(max_val)) + + hist, hist_edeges = np.histogram(arr, bins=num_bins, range=(-th, th)) + zero_bin_idx = num_bins // 2 + num_half_quantized_bins = num_quantized_bins // 2 + assert np.allclose(hist_edeges[zero_bin_idx] + hist_edeges[zero_bin_idx + 1], + 0, rtol=1e-5, atol=1e-7) + + thresholds = np.zeros(num_bins // 2 + 1 - num_quantized_bins // 2) + divergence = np.zeros_like(thresholds) + quantized_bins = np.zeros(num_quantized_bins, dtype=np.int32) + # i means the number of bins on half axis excluding the zero bin + for i in range(num_quantized_bins // 2, + num_bins // 2 + 1): + p_bin_idx_start = zero_bin_idx - i + p_bin_idx_stop = zero_bin_idx + i + 1 + thresholds[i - num_half_quantized_bins] = hist_edeges[p_bin_idx_stop] + # sliced_nd_hist is used to generate candidate distribution q + sliced_nd_hist = hist[p_bin_idx_start:p_bin_idx_stop] + + # generate reference distribution p + p = sliced_nd_hist.copy() + assert p.size % 2 == 1 + assert p.size >= num_quantized_bins + # put left outlier count in p[0] + left_outlier_count = np.sum(hist[0:p_bin_idx_start]) + p[0] += left_outlier_count + # put right outlier count in p[-1] + right_outlier_count = 
np.sum(hist[p_bin_idx_stop:]) + p[-1] += right_outlier_count + # is_nonzeros[k] indicates whether hist[k] is nonzero + is_nonzeros = (sliced_nd_hist != 0).astype(np.int32) + + # calculate how many bins should be merged to generate quantized distribution q + num_merged_bins = p.size // num_quantized_bins + # merge hist into num_quantized_bins bins + for j in range(num_quantized_bins): + start = j * num_merged_bins + stop = start + num_merged_bins + quantized_bins[j] = sliced_nd_hist[start:stop].sum() + quantized_bins[-1] += sliced_nd_hist[num_quantized_bins * num_merged_bins:].sum() + # expand quantized_bins into p.size bins + q = np.zeros(p.size, dtype=np.float32) + for j in range(num_quantized_bins): + start = j * num_merged_bins + if j == num_quantized_bins - 1: + stop = -1 + else: + stop = start + num_merged_bins + norm = is_nonzeros[start:stop].sum() + if norm != 0: + q[start:stop] = float(quantized_bins[j]) / float(norm) + q[sliced_nd_hist == 0] = 0 + p = _smooth_distribution(p) + q = _smooth_distribution(q) + divergence[i - num_half_quantized_bins] = stats.entropy(p, q) + quantized_bins[:] = 0 + + min_divergence_idx = np.argmin(divergence) + min_divergence = divergence[min_divergence_idx] + opt_th = thresholds[min_divergence_idx] + return min_val, max_val, min_divergence, opt_th +# pylint: enable=line-too-long + + +def _get_optimal_thresholds(nd_dict, num_bins=8001, num_quantized_bins=255, logger=None): + """Given a ndarray dict, find the optimal threshold for quantizing each value of the key.""" + if stats is None: + raise ImportError('scipy.stats is required for running entropy mode of calculating' + ' the optimal thresholds for quantizing FP32 ndarrays into int8.' + ' Please check if the scipy python bindings are installed.') + assert isinstance(nd_dict, dict) + if logger is not None: + logger.info('Calculating optimal thresholds for quantization using KL divergence' + ' with num_bins=%d and num_quantized_bins=%d' % (num_bins, num_quantized_bins)) + th_dict = {} + # copy nd_dict keys since the keys() only returns a view in python3 + layer_names = list(nd_dict.keys()) + for name in layer_names: + assert name in nd_dict + min_val, max_val, min_divergence, opt_th =\ + _get_optimal_threshold(nd_dict[name], num_bins=num_bins, + num_quantized_bins=num_quantized_bins) + del nd_dict[name] # release the memory of ndarray + th_dict[name] = (-opt_th, opt_th) + if logger is not None: + logger.info('layer=%s, min_val=%f, max_val=%f, min_divergence=%f, optimal_threshold=%f' + % (name, min_val, max_val, min_divergence, opt_th)) + return th_dict + + +def _load_sym(sym, logger=logging): + """Given a str as a path the symbol .json file or a symbol, returns a Symbol object.""" + if isinstance(sym, str): # sym is a symbol file path + cur_path = os.path.dirname(os.path.realpath(__file__)) + symbol_file_path = os.path.join(cur_path, sym) + logger.info('Loading symbol from file %s' % symbol_file_path) + return sym_load(symbol_file_path) + elif isinstance(sym, Symbol): + return sym + else: + raise ValueError('_load_sym only accepts Symbol or path to the symbol file,' + ' while received type %s' % str(type(sym))) + + +def _load_params(params, logger=logging): + """Given a str as a path to the .params file or a pair of params, + returns two dictionaries representing arg_params and aux_params. 
+ """ + if isinstance(params, str): + cur_path = os.path.dirname(os.path.realpath(__file__)) + param_file_path = os.path.join(cur_path, params) + logger.info('Loading params from file %s' % param_file_path) + save_dict = nd_load(param_file_path) + arg_params = {} + aux_params = {} + for k, v in save_dict.items(): + tp, name = k.split(':', 1) + if tp == 'arg': + arg_params[name] = v + if tp == 'aux': + aux_params[name] = v + return arg_params, aux_params + elif isinstance(params, (tuple, list)) and len(params) == 2: + return params[0], params[1] + else: + raise ValueError('Unsupported params provided. Must be either a path to the param file or' + ' a pair of dictionaries representing arg_params and aux_params') + + +def quantize_model(sym, arg_params, aux_params, + data_names=('data',), label_names=('softmax_label',), + ctx=cpu(), excluded_sym_names=None, calib_mode='entropy', + calib_data=None, num_calib_examples=None, calib_layer=None, logger=logging): + """User-level API for generating a quantized model from a FP32 model w/ or w/o calibration. + The backend quantized operators are only enabled for Linux systems. Please do not run + inference using the quantized models on Windows for now. + The quantization implementation adopts the TensorFlow's approach: + https://www.tensorflow.org/performance/quantization. + The calibration implementation borrows the idea of Nvidia's 8-bit Inference with TensorRT: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + and adapts the method to MXNet. + + Parameters + ---------- + sym : str or Symbol + Defines the structure of a neural network for FP32 data types. + arg_params : dict + Dictionary of name to `NDArray`. + aux_params : dict + Dictionary of name to `NDArray`. + data_names : a list of strs + Data names required for creating a Module object to run forward propagation on the + calibration dataset. + label_names : a list of strs + Label names required for creating a Module object to run forward propagation on the + calibration dataset. + ctx : Context + Defines the device that users want to run forward propagation on the calibration + dataset for collecting layer output statistics. Currently, only supports single context. + excluded_sym_names : list of strings + A list of strings representing the names of the symbols that users want to excluding + from being quantized. + calib_mode : str + If calib_mode='none', no calibration will be used and the thresholds for + requantization after the corresponding layers will be calculated at runtime by + calling min and max operators. The quantized models generated in this + mode are normally 10-20% slower than those with calibrations during inference. + If calib_mode='naive', the min and max values of the layer outputs from a calibration + dataset will be directly taken as the thresholds for quantization. + If calib_mode='entropy' (default mode), the thresholds for quantization will be + derived such that the KL divergence between the distributions of FP32 layer outputs and + quantized layer outputs is minimized based upon the calibration dataset. + calib_data : DataIter + A data iterator initialized by the calibration dataset. + num_calib_examples : int or None + The maximum number of examples that user would like to use for calibration. If not provided, + the whole calibration dataset will be used. + calib_layer : function + Given a layer's output name in string, return True or False for deciding whether to + calibrate this layer. 
If yes, the statistics of the layer's output will be collected; + otherwise, no information of the layer's output will be collected. If not provided, + all the layers' outputs that need requantization will be collected. + logger : Object + A logging object for printing information during the process of quantization. + + Returns + ------- + tuple + A tuple of quantized symbol, quantized arg_params, and aux_params. + ------- + """ + if excluded_sym_names is None: + excluded_sym_names = [] + if not isinstance(excluded_sym_names, list): + raise ValueError('excluded_sym_names must be a list of strings representing' + ' the names of the symbols that will not be quantized,' + ' while received type %s' % str(type(excluded_sym_names))) + excluded_syms = [] + if excluded_sym_names is not None: + for sym_name in excluded_sym_names: + nodes = sym.get_internals() + idx = nodes.list_outputs().index(sym_name + '_output') + excluded_syms.append(nodes[idx]) + logger.info('Quantizing symbol') + qsym = _quantize_symbol(sym, excluded_symbols=excluded_syms, + offline_params=list(arg_params.keys())) + + logger.info('Quantizing parameters') + qarg_params = _quantize_params(qsym, arg_params) + + if calib_mode is not None and calib_mode != 'none': + if not isinstance(ctx, Context): + raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) + if calib_data is None: + raise ValueError('calib_data must be provided when calib_mode=%s' % calib_mode) + if not isinstance(calib_data, DataIter): + raise ValueError('calib_data must be of DataIter type when calib_mode=%s,' + ' while received type %s' % (calib_mode, str(type(calib_data)))) + if calib_layer is None: + calib_layer = lambda name: name.endswith('_output') + + mod = Module(symbol=sym, data_names=data_names, label_names=label_names, context=ctx) + if len(calib_data.provide_label) > 0: + mod.bind(for_training=False, data_shapes=calib_data.provide_data, + label_shapes=calib_data.provide_label) + else: + mod.bind(for_training=False, data_shapes=calib_data.provide_data) + mod.set_params(arg_params, aux_params) + if calib_mode == 'entropy': + nd_dict, num_examples = _collect_layer_outputs(mod, calib_data, + include_layer=calib_layer, + max_num_examples=num_calib_examples, + logger=logger) + logger.info('Collected layer outputs from FP32 model using %d examples' % num_examples) + logger.info('Calculating optimal thresholds for quantization') + th_dict = _get_optimal_thresholds(nd_dict, logger=logger) + elif calib_mode == 'naive': + th_dict, num_examples = _collect_layer_output_min_max( + mod, calib_data, include_layer=calib_layer, max_num_examples=num_calib_examples, + logger=logger) + logger.info('Collected layer output min/max values from FP32 model using %d examples' + % num_examples) + else: + raise ValueError('unknown calibration mode %s received,' + ' expected `none`, `naive`, or `entropy`' % calib_mode) + logger.info('Calibrating quantized symbol') + qsym = _calibrate_quantized_sym(qsym, th_dict) + + return qsym, qarg_params, aux_params diff --git a/python/mxnet/model.py b/python/mxnet/model.py index c66b17cab34d..26e885a1cd8d 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -28,7 +28,7 @@ import numpy as np from . import io -from . import nd +from . import ndarray as nd from . import symbol as sym from . import optimizer as opt from . 
import metric @@ -592,15 +592,17 @@ def __setstate__(self, state): def _init_predictor(self, input_shapes, type_dict=None): """Initialize the predictor module for running prediction.""" + shapes = {name: self.arg_params[name].shape for name in self.arg_params} + shapes.update(dict(input_shapes)) if self._pred_exec is not None: - arg_shapes, _, _ = self.symbol.infer_shape(**dict(input_shapes)) + arg_shapes, _, _ = self.symbol.infer_shape(**shapes) assert arg_shapes is not None, "Incomplete input shapes" pred_shapes = [x.shape for x in self._pred_exec.arg_arrays] if arg_shapes == pred_shapes: return # for now only use the first device pred_exec = self.symbol.simple_bind( - self.ctx[0], grad_req='null', type_dict=type_dict, **dict(input_shapes)) + self.ctx[0], grad_req='null', type_dict=type_dict, **shapes) pred_exec.copy_params_from(self.arg_params, self.aux_params) _check_arguments(self.symbol) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 3668af060069..4666b6adf0c3 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -571,3 +571,54 @@ int MXSymbolGrad(SymbolHandle sym, mx_uint num_wrt, const char** wrt, SymbolHand LOG(FATAL) << "not implemented"; API_END(); } + +int MXQuantizeSymbol(SymbolHandle sym_handle, + SymbolHandle *ret_sym_handle, + const mx_uint num_excluded_symbols, + const SymbolHandle *excluded_symbols, + const mx_uint num_offline, + const char **offline_params) { + nnvm::Symbol *s = new nnvm::Symbol(); + API_BEGIN(); + nnvm::Symbol *sym = static_cast(sym_handle); + nnvm::Graph g = Symbol2Graph(*sym); + std::unordered_set excluded_nodes; + for (size_t i = 0; i < num_excluded_symbols; ++i) { + nnvm::Symbol* sym = static_cast(excluded_symbols[i]); + for (const auto& e : sym->outputs) { + excluded_nodes.emplace(e.node); + } + } + g.attrs["excluded_nodes"] = std::make_shared(std::move(excluded_nodes)); + std::unordered_set offline; + for (size_t i = 0; i < num_offline; ++i) { + offline.emplace(offline_params[i]); + } + g.attrs["offline_params"] = std::make_shared(std::move(offline)); + g = ApplyPass(std::move(g), "QuantizeGraph"); + s->outputs = g.outputs; + *ret_sym_handle = s; + API_END_HANDLE_ERROR(delete s); +} + +int MXSetCalibTableToQuantizedSymbol(SymbolHandle qsym_handle, + const mx_uint num_layers, + const char** layer_names, + const float* min_ranges, + const float* max_ranges, + SymbolHandle* ret_qsym_handle) { + nnvm::Symbol* s = new nnvm::Symbol(); + API_BEGIN(); + nnvm::Symbol* sym = static_cast(qsym_handle); + nnvm::Graph g = Symbol2Graph(*sym); + const std::string prefix = "quantized_"; + std::unordered_map> calib_table; + for (size_t i = 0; i < num_layers; ++i) { + calib_table.emplace(prefix+layer_names[i], std::make_pair(min_ranges[i], max_ranges[i])); + } + g.attrs["calib_table"] = std::make_shared(std::move(calib_table)); + g = ApplyPass(std::move(g), "SetCalibTableToQuantizedGraph"); + s->outputs = g.outputs; + *ret_qsym_handle = s; + API_END_HANDLE_ERROR(delete s); +} diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h index 2682b94b4fae..f06a4e4aabe9 100644 --- a/src/io/inst_vector.h +++ b/src/io/inst_vector.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index c98a010774d7..5632d73c2611 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git 
a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 951063fb4b2f..79f2e800ee14 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -468,6 +468,10 @@ There are other options to tune the performance. else return std::vector{"data", "weight", "bias"}; }) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) .set_attr("FInferShape", ConvolutionShape) .set_attr("FInferType", ConvolutionType) .set_attr("FInferStorageType", ConvStorageType) diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 75d594ffd91d..475b63625166 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -267,6 +267,10 @@ This could be used for model inference with `row_sparse` weights trained with `S return std::vector{"data", "weight"}; } }) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) #if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index ff429df9d7b4..18f088f758e4 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -103,7 +103,8 @@ void LayerNormCompute(const nnvm::NodeAttrs& attrs, size_t workspace_size = 0; MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { - workspace_size = broadcast::ReduceWorkspaceSize(s, mean_data, req[0], in_data); + workspace_size = + broadcast::ReduceWorkspaceSize(s, mean_data.shape_, req[0], in_data.shape_); }); }); workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -202,16 +203,14 @@ void LayerNormGradCompute(const nnvm::NodeAttrs& attrs, BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { reduce_workspace_size = std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize( - s, ograd.reshape(red_src_shape), kAddTo, - mean.reshape(red_dst_shape))); + broadcast::ReduceWorkspaceSize(s, red_src_shape, + kAddTo, red_dst_shape)); }); BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { reduce_workspace_size = std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize( - s, ograd.reshape(red_exclude_src_shape), kAddTo, - gamma.reshape(red_exclude_dst_shape))); + broadcast::ReduceWorkspaceSize(s, red_exclude_src_shape, kAddTo, + red_exclude_dst_shape)); }); }); workspace = ctx.requested[0].get_space_typed( diff --git a/src/operator/contrib/dequantize-inl.h b/src/operator/quantization/dequantize-inl.h similarity index 52% rename from src/operator/contrib/dequantize-inl.h rename to src/operator/quantization/dequantize-inl.h index 8f24a8fd7b5c..799e13665664 100644 --- a/src/operator/contrib/dequantize-inl.h +++ b/src/operator/quantization/dequantize-inl.h @@ -22,8 +22,8 @@ * \file dequantize-inl.h * \brief Implementation of dequantize operation */ -#ifndef MXNET_OPERATOR_CONTRIB_DEQUANTIZE_INL_H_ -#define MXNET_OPERATOR_CONTRIB_DEQUANTIZE_INL_H_ +#ifndef MXNET_OPERATOR_QUANTIZATION_DEQUANTIZE_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_DEQUANTIZE_INL_H_ #include #include @@ -31,6 +31,7 @@ #include "../elemwise_op_common.h" #include "../mshadow_op.h" #include "../mxnet_op.h" +#include "./quantization_utils.h" namespace mxnet { namespace op { @@ -40,18 +41,30 @@ struct DequantizeParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(DequantizeParam) { DMLC_DECLARE_FIELD(out_type) 
.add_enum("float32", mshadow::kFloat32) + .set_default(mshadow::kFloat32) .describe("Output data type."); } }; -struct dequantize { +// dequantize unsigned int8 to float32 +struct dequantize_unsigned { template MSHADOW_XINLINE static void Map(int i, DstDType *out, const SrcDType *in, - float *imin_range, float *imax_range, - double imin_limit, double imax_limit, - float half_range) { - float scale = (*imax_range - *imin_range) / (imax_limit - imin_limit); - out[i] = static_cast((in[i] + half_range) * scale + *imin_range); + const float *imin_range, const float *imax_range, + const float imin_limit, const float imax_limit) { + const float scale = (*imax_range - *imin_range) / (imax_limit - imin_limit); + out[i] = static_cast(in[i] * scale + *imin_range); + } +}; + +// keep zero-center +struct dequantize_zero_centered { + template + MSHADOW_XINLINE static void Map(int i, DstDType *out, const SrcDType *in, + const float *imin_range, const float *imax_range, + const float quantized_range) { + const float real_range = MaxAbs(*imax_range, *imin_range); + out[i] = in[i] * (real_range / quantized_range); } }; @@ -63,20 +76,20 @@ void DequantizeCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { using namespace mshadow; using namespace mxnet_op; + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; Stream *s = ctx.get_stream(); - - // for now, only supports dequantize from float to uint8 - typedef float DstDType; - typedef uint8_t SrcDType; - double min_limit = static_cast(std::numeric_limits::min()); - double max_limit = static_cast(std::numeric_limits::max()); - float half_range = !std::is_signed::value - ? 0.0f - : (max_limit - min_limit + 1) / 2.0; - - Kernel::Launch(s, outputs[0].Size(), outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), - min_limit, max_limit, half_range); + if (inputs[0].type_flag_ == mshadow::kUint8) { + Kernel::Launch(s, outputs[0].Size(), outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), + MinValue(), MaxValue()); + } else if (inputs[0].type_flag_ == mshadow::kInt8) { + Kernel::Launch(s, outputs[0].Size(), outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), + MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "dequantize op only supports input type int8 or uint8"; + } } inline bool DequantizeShape(const nnvm::NodeAttrs& attrs, @@ -85,30 +98,27 @@ inline bool DequantizeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 1U); - CHECK(!shape_is_none(in_attrs->at(0))); for (size_t i = 1; i < 3; ++i) { - CHECK(shape_is_scalar(in_attrs->at(i))) << in_attrs->at(i); + SHAPE_ASSIGN_CHECK(*in_attrs, i, TShape({1})); } SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - return true; + return !shape_is_none(out_attrs->at(0)); } inline bool DequantizeType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_attrs, + std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 1U); - CHECK_EQ((*in_attrs)[0], mshadow::kUint8) - << "`dequantize` only supports uint8 input for now"; - CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "the second input of `dequantize` should be a tensor with type of float"; - CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) - << "the third input of `dequantize` should be a tensor with type of float"; + CHECK(in_attrs->at(0) == mshadow::kUint8 || in_attrs->at(0) == mshadow::kInt8) + << "the input data type of dequantize op 
must be provided, either uint8 or int8"; + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32); TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); return (*in_attrs)[0] != -1; } } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_DEQUANTIZE_INL_H_ +#endif // MXNET_OPERATOR_QUANTIZATION_DEQUANTIZE_INL_H_ diff --git a/src/operator/contrib/dequantize.cc b/src/operator/quantization/dequantize.cc similarity index 71% rename from src/operator/contrib/dequantize.cc rename to src/operator/quantization/dequantize.cc index 7814a157719c..92b808dd460d 100644 --- a/src/operator/contrib/dequantize.cc +++ b/src/operator/quantization/dequantize.cc @@ -30,14 +30,20 @@ DMLC_REGISTER_PARAMETER(DequantizeParam); NNVM_REGISTER_OP(_contrib_dequantize) .describe(R"code(Dequantize the input tensor into a float tensor. -[min_range, max_range] are scalar floats that spcify the range for +min_range and max_range are scalar floats that specify the range for the output data. -Each value of the tensor will undergo the following: +When the input data type is `uint8`, the output is calculated using the following equation: -`out[i] = min_range + (in[i] * (max_range - min_range) / range(INPUT_TYPE))` +`out[i] = in[i] * (max_range - min_range) / 255.0`, -here `range(T) = numeric_limits::max() - numeric_limits::min()` +When the input data type is `int8`, the output is calculated using the following equation, +which keeps the quantized value zero-centered: + +`out[i] = in[i] * MaxAbs(min_range, max_range) / 127.0`, + +.. Note:: + This operator only supports forward propagation. DO NOT use it in training. )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs(3) @@ -45,12 +51,11 @@ here `range(T) = numeric_limits::max() - numeric_limits::min()` .set_attr("FInferShape", DequantizeShape) .set_attr("FInferType", DequantizeType) .set_attr("FCompute", DequantizeCompute) -.set_attr("FGradient", ElemwiseGradUseNone{"_dequantize"}) -.add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`") +.add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`") .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value " - "possibly produced for the input") + "possibly produced for the input in float32") .add_argument("max_range", "NDArray-or-Symbol", "The maximum scalar value " - "possibly produced for the input") + "possibly produced for the input in float32") .add_arguments(DequantizeParam::__FIELDS__()); } // namespace op diff --git a/src/operator/contrib/dequantize.cu b/src/operator/quantization/dequantize.cu similarity index 100% rename from src/operator/contrib/dequantize.cu rename to src/operator/quantization/dequantize.cu diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h new file mode 100644 index 000000000000..5b096ac0057a --- /dev/null +++ b/src/operator/quantization/quantization_utils.h @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantization_utils-inl.h + */ +#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZATION_UTILS_H_ +#define MXNET_OPERATOR_QUANTIZATION_QUANTIZATION_UTILS_H_ + +#include +#include +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + + +template +MSHADOW_XINLINE int Sign(T val) { + return (val > T(0)) - (val < T(0)); +} + +template +MSHADOW_XINLINE T Abs(T a) { +#ifdef __CUDACC__ + return ::abs(a); +#else + return std::abs(a); +#endif +} + +template +MSHADOW_XINLINE T Max(T a, T b) { +#ifdef __CUDACC__ + return ::max(a, b); +#else + return std::max(a, b); +#endif +} + +template +MSHADOW_XINLINE T Min(T a, T b) { +#ifdef __CUDACC__ + return ::min(a, b); +#else + return std::min(a, b); +#endif +} + +template +MSHADOW_XINLINE float MaxAbs(T a, T b) { + return Max(Abs(static_cast(a)), Abs(static_cast(b))); +} + +template +MSHADOW_XINLINE float MinAbs(T a, T b) { + return Min(Abs(static_cast(a)), Abs(static_cast(b))); +} + +template +MSHADOW_XINLINE T FloatToQuantized(float input, float min_range, float max_range) { + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; + float real_range = MaxAbs(min_range, max_range); + float quantized_range = MinAbs(MaxValue(), MinValue()); + float scale = quantized_range / real_range; + return Sign(input) * Min(Abs(input) * scale + 0.5f, quantized_range); +} + +template +MSHADOW_XINLINE float QuantizedToFloat(T input, float min_range, float max_range) { + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; + float quantized_range = MinAbs(MinValue(), MaxValue()); + float real_range = MaxAbs(min_range, max_range); + float scale = real_range / quantized_range; + return input * scale; +} + +struct QuantizedToFloatStruct { + template + MSHADOW_XINLINE static void Map(int i, float *output, const T *input, + const float *range_min, const float *range_max) { + output[i] = QuantizedToFloat(input[i], *range_min, *range_max); + } +}; + +template +MSHADOW_XINLINE T2 RequantizeInNewRange(T1 input, float min_input, float max_input, + float min_new, float max_new) { + const float input_float = QuantizedToFloat(input, min_input, max_input); + return FloatToQuantized(input_float, min_new, max_new); +} + +template +MSHADOW_XINLINE void RequantizeManyInNewRange(size_t count, T2* output, const T1 *input, + float input_min, float input_max, + float actual_min, float actual_max) { + for (size_t i = 0; i < count; ++i) { + const float input_float = + QuantizedToFloat(input[i], input_min, input_max); + output[i] = FloatToQuantized(input_float, actual_min, actual_max); + } +} + +/*! + * \brief Get the scaling factor for converting type T to float. 
+ */ +template +MSHADOW_XINLINE float FloatForOneQuantizedLevel(float range_min, float range_max) { + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; + const int64_t highest = static_cast(MaxValue()); + const int64_t lowest = static_cast(MinValue()); + const float float_for_one_quantized_level = + (range_max - range_min) / (highest - lowest); + return float_for_one_quantized_level; +} + +template +MSHADOW_XINLINE void QuantizationRangeForMultiplication(float min_a, float max_a, + float min_b, float max_b, + float* min_c, float* max_c) { + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; + const float a_float_for_one_quant_level = + FloatForOneQuantizedLevel(min_a, max_a); + const float b_float_for_one_quant_level = + FloatForOneQuantizedLevel(min_b, max_b); + + const int64_t c_highest = + static_cast(MaxValue()); + const int64_t c_lowest = + static_cast(MinValue()); + const float c_float_for_one_quant_level = + a_float_for_one_quant_level * b_float_for_one_quant_level; + + *min_c = c_float_for_one_quant_level * c_lowest; + *max_c = c_float_for_one_quant_level * c_highest; +} + +struct QuantizationRangeForMultiplicationStruct { + MSHADOW_XINLINE static void Map(int i, + float *min_c, + float *max_c, + const float *min_a, + const float *max_a, + const float *min_b, + const float *max_b) { + QuantizationRangeForMultiplication( + min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c); + } +}; + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZATION_UTILS_H_ diff --git a/src/operator/contrib/quantize-inl.h b/src/operator/quantization/quantize-inl.h similarity index 52% rename from src/operator/contrib/quantize-inl.h rename to src/operator/quantization/quantize-inl.h index 4d55b1b5c6d0..8b7a11cc5a89 100644 --- a/src/operator/contrib/quantize-inl.h +++ b/src/operator/quantization/quantize-inl.h @@ -22,8 +22,8 @@ * \file quantize-inl.h * \brief implementation of quantize operation */ -#ifndef MXNET_OPERATOR_CONTRIB_QUANTIZE_INL_H_ -#define MXNET_OPERATOR_CONTRIB_QUANTIZE_INL_H_ +#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZE_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_QUANTIZE_INL_H_ #include #include @@ -31,6 +31,7 @@ #include "../elemwise_op_common.h" #include "../mshadow_op.h" #include "../mxnet_op.h" +#include "./quantization_utils.h" namespace mxnet { namespace op { @@ -39,25 +40,47 @@ struct QuantizeParam : public dmlc::Parameter { int out_type; DMLC_DECLARE_PARAMETER(QuantizeParam) { DMLC_DECLARE_FIELD(out_type) + .add_enum("int8", mshadow::kInt8) .add_enum("uint8", mshadow::kUint8) .set_default(mshadow::kUint8) .describe("Output data type."); } }; -struct quantize { +// quantize float to uint8_t +struct quantize_unsigned { template MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, float *omax_range, const SrcDType *in, const float *imin_range, const float *imax_range, - double min_limit, double max_limit) { - float scale = (max_limit - min_limit) / (*imax_range - *imin_range); + const double min_limit, const double max_limit) { + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; + const float scale = (max_limit - min_limit) / (*imax_range - *imin_range); out[i] = static_cast((in[i] - *imin_range) * scale + 0.5); *omin_range = *imin_range; *omax_range = *imax_range; } }; + +// keep zero-center +struct quantize_zero_centered { + template + MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, + float *omax_range, const SrcDType 
*in, + const float *imin_range, const float *imax_range, + const float quantized_range) { + float real_range = MaxAbs(*imin_range, *imax_range); + float scale = quantized_range / real_range; + SrcDType x = in[i]; + out[i] = static_cast( + Sign(x) * Min(Abs(x) * scale + 0.5f, quantized_range)); + *omin_range = -real_range; + *omax_range = real_range; + } +}; + template void QuantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -66,16 +89,24 @@ void QuantizeCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { using namespace mshadow; using namespace mxnet_op; + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; Stream *s = ctx.get_stream(); - // for now, only supports quantize from uint8 to float - // TODO(ziheng) consider add MSHADOW_INTEGER_TYPE_SWITCH - typedef uint8_t DstDType; - typedef float SrcDType; - Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr(), outputs[1].dptr(), outputs[2].dptr(), - inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), - std::numeric_limits::min(), std::numeric_limits::max()); + const QuantizeParam& param = nnvm::get(attrs.parsed); + if (param.out_type == mshadow::kUint8) { + Kernel::Launch(s, outputs[0].Size(), + outputs[0].dptr(), outputs[1].dptr(), outputs[2].dptr(), + inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), + MinValue(), MaxValue()); + } else if (param.out_type == mshadow::kInt8) { // zero-centered quantization + Kernel::Launch(s, outputs[0].Size(), + outputs[0].dptr(), outputs[1].dptr(), outputs[2].dptr(), + inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), + MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } } inline bool QuantizeShape(const nnvm::NodeAttrs& attrs, @@ -84,15 +115,14 @@ inline bool QuantizeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 3U); - CHECK(!shape_is_none(in_attrs->at(0))); for (size_t i = 1; i < 3; ++i) { - CHECK(shape_is_scalar(in_attrs->at(i))); + SHAPE_ASSIGN_CHECK(*in_attrs, i, TShape({1})); } SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape{1}); SHAPE_ASSIGN_CHECK(*out_attrs, 2, TShape{1}); - return true; + return !shape_is_none(out_attrs->at(0)); } inline bool QuantizeType(const nnvm::NodeAttrs& attrs, @@ -100,13 +130,17 @@ inline bool QuantizeType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 3U); - CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`quantize` only supports float32 input for now"; - CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "the second input of `quantize` should be a tensor with type of float"; - CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) - << "the third input of `quantize` should be a tensor with type of float"; - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8); + const QuantizeParam& param = nnvm::get(attrs.parsed); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32); + if (param.out_type == mshadow::kUint8) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8); + } else if (param.out_type == mshadow::kInt8) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32); TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32); return (*in_attrs)[0] 
!= -1; @@ -114,4 +148,4 @@ inline bool QuantizeType(const nnvm::NodeAttrs& attrs, } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_QUANTIZE_INL_H_ +#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_INL_H_ diff --git a/src/operator/contrib/quantize.cc b/src/operator/quantization/quantize.cc similarity index 65% rename from src/operator/contrib/quantize.cc rename to src/operator/quantization/quantize.cc index 43d60d1dd83d..32eb952fa5d7 100644 --- a/src/operator/contrib/quantize.cc +++ b/src/operator/quantization/quantize.cc @@ -32,21 +32,37 @@ NNVM_REGISTER_OP(_contrib_quantize) .describe(R"code(Quantize a input tensor from float to `out_type`, with user-specified `min_range` and `max_range`. -[min_range, max_range] are scalar floats that spcify the range for -the input data. Each value of the tensor will undergo the following: +min_range and max_range are scalar floats that specify the range for +the input data. -`out[i] = (in[i] - min_range) * range(OUTPUT_TYPE) / (max_range - min_range)` +When out_type is `uint8`, the output is calculated using the following equation: -here `range(T) = numeric_limits::max() - numeric_limits::min()` -)code" ADD_FILELINE) +`out[i] = (in[i] - min_range) * range(OUTPUT_TYPE) / (max_range - min_range) + 0.5`, + +where `range(T) = numeric_limits::max() - numeric_limits::min()`. + +When out_type is `int8`, the output is calculated using the following equation, +which keeps the quantized value zero-centered: + +`out[i] = sign(in[i]) * min(abs(in[i]) * scale + 0.5f, quantized_range)`, + +where +`quantized_range = MinAbs(max(int8), min(int8))` and +`scale = quantized_range / MaxAbs(min_range, max_range)`. + +.. Note:: + This operator only supports forward propagation. DO NOT use it in training.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs(3) .set_num_outputs(3) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "min_range", "max_range"}; + }) .set_attr("FInferShape", QuantizeShape) .set_attr("FInferType", QuantizeType) .set_attr("FCompute", QuantizeCompute) -.set_attr("FGradient", ElemwiseGradUseNone{"_quantize"}) -.add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value " "possibly produced for the input") .add_argument("max_range", "NDArray-or-Symbol", "The maximum scalar value " diff --git a/src/operator/contrib/quantize.cu b/src/operator/quantization/quantize.cu similarity index 100% rename from src/operator/contrib/quantize.cu rename to src/operator/quantization/quantize.cu diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc new file mode 100644 index 000000000000..5ec745ccdf31 --- /dev/null +++ b/src/operator/quantization/quantize_graph_pass.cc @@ -0,0 +1,308 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2016 by Contributors + * \file quantization.cc + * \brief + */ +#include +#include +#include +#include + +namespace mxnet { +namespace op { + +using nnvm::Symbol; +using nnvm::Node; +using nnvm::NodePtr; +using nnvm::NodeEntry; +using nnvm::Graph; + +NodePtr CreateNode(std::string op_name, std::string node_name) { + NodePtr node = Node::Create(); + node->attrs.name = node_name; + if (op_name == "nullptr") { + node->attrs.op = nullptr; + // ugly workaround because VariableParam is not exposed + node->attrs.parsed = + nnvm::Symbol::CreateVariable(node->attrs.name).outputs[0].node->attrs.parsed; + } else { + node->attrs.op = Op::Get(op_name); + } + return node; +} + +/*! + * \brief Insert a node named with node_name holding the op of op_name + * before the node current and after the node previous. + */ +NodePtr InsertNode(std::string op_name, + std::string node_name, NodePtr current, NodeEntry previous) { + NodePtr node = CreateNode(op_name, node_name); + node->inputs.emplace_back(previous); + current->inputs.emplace_back(NodeEntry{node, 0, 0}); + return node; +} + +std::vector OfflineParams(std::vector&& outputs, + std::unordered_set&& offline_params) { + std::string node_suffixs[3] = {"", "_min", "_max"}; + std::unordered_map mirror_map; + nnvm::NodeEntryMap entry_var; + auto need_offline = [&](NodePtr n) { + return n->op() && + (n->op()->name == "_contrib_quantize") && + n->inputs[0].node->is_variable() && + offline_params.count(n->inputs[0].node->attrs.name); + }; + DFSVisit(outputs, [&](const NodePtr& node) { + for (NodeEntry& e : node->inputs) { + if (need_offline(e.node)) { + std::string node_name = e.node->attrs.name; + if (!entry_var.count(e)) { + entry_var[e] = CreateNode("nullptr", node_name + node_suffixs[e.index]); + } + e.node = entry_var[e]; + e.index = 0; + e.version = 0; + } + } + }); + return outputs; +} + +inline bool NeedQuantize(NodePtr node, const std::unordered_set excluded_nodes) { + static auto& quantized_op_map = Op::GetAttr("FQuantizedOp"); + return quantized_op_map.count(node->op()) && !excluded_nodes.count(node); +} + +Graph QuantizeGraph(Graph &&src) { + static auto& quantized_op_map = Op::GetAttr("FQuantizedOp"); + static auto& need_requantize_map = Op::GetAttr("FNeedRequantize"); + auto offline_params = src.GetAttr>("offline_params"); + auto excluded_nodes = src.GetAttr>("excluded_nodes"); + + // mirror_map stores the mapping from the currently visited graph to the newly created quantized + // graph. Key is the currently visited graph's node pointer, and value is a copied node of the key + // node. The existing key's value may be updated with the newly created quantize/dequantize op. + std::unordered_map mirror_map; + DFSVisit(src.outputs, [&](const NodePtr& node) { + NodePtr new_node = Node::Create(); + // If the currently visited node needs quantization, insert a quantize op node before the + // current node and replace the current node with the quantized version in the new graph. 
+ if (NeedQuantize(node, excluded_nodes)) { + auto fquantized_op = quantized_op_map[node->op()]; + // If the currently visited node's op registered the FQuantizedOp property, new_node is a + // quantized version of that op, such as quantized_conv2d. + new_node = fquantized_op(node->attrs); + + // add data into quantized op input + for (const auto& e : node->inputs) { + NodePtr mirror_node = mirror_map.at(e.node.get()); + NodeEntry mirror_entry = NodeEntry{ + mirror_node, e.index, e.version}; + // If the NodeEntry e's node does not need quantization, and (the mirror_node is a variable, + // or the mirror_node's op is not a quantize op), create quantize op, min op, and max op + // taking mirror_entry as input to generate a quantized NDArray. Save the mapping between + // e's source node and the newly created quantize op so that the quantize op can be + // reused next time when the same entry is visited again. + if (!NeedQuantize(e.node, excluded_nodes) && + (mirror_node->op() == nullptr || + mirror_node->op()->name != "_contrib_quantize")) { + NodePtr quantize_node = InsertNode("_contrib_quantize", + e.node->attrs.name + "_quantize", new_node, mirror_entry); + quantize_node->attrs.dict["out_type"] = "int8"; + quantize_node->op()->attr_parser(&(quantize_node->attrs)); + + NodePtr min_node = InsertNode("min", + e.node->attrs.name + "_min", quantize_node, mirror_entry); + min_node->op()->attr_parser(&(min_node->attrs)); + + NodePtr max_node = InsertNode("max", + e.node->attrs.name + "_max", quantize_node, mirror_entry); + max_node->op()->attr_parser(&(max_node->attrs)); + + mirror_map[e.node.get()] = std::move(quantize_node); + } else { + // If the entry e's node needs quantization, or mirror_entry is from a quantize op, + // simply add mirror_entry to the input of the new_node. + new_node->inputs.emplace_back(mirror_entry); + } + // the input should be a `quantize` node or a quantized op by now + } + + // add min and max into the quantized op's inputs; assume the order of quantized op inputs is: + // data1, data2, ..., min1, max1, min2, max2, ... + for (const auto& e : node->inputs) { + NodePtr mirror_node = mirror_map.at(e.node.get()); + NodeEntry mirror_entry = NodeEntry{ + mirror_node, e.index, e.version}; + // for quantize node + uint32_t min_index = 1; + uint32_t max_index = 2; + if (quantized_op_map.count(e.node->op())) { + size_t num_outputs = e.node->num_outputs(); + min_index = num_outputs + 2 * e.index; + max_index = num_outputs + 2 * e.index + 1; + } else { + CHECK(mirror_node->op()->name == "_contrib_quantize") + << "The input is not quantize or quantized_op"; + } + new_node->inputs.emplace_back(NodeEntry{mirror_node, min_index, 0}); + new_node->inputs.emplace_back(NodeEntry{mirror_node, max_index, 0}); + } + + // If the new_node op registered attr FNeedRequantize, insert requantize node after it. + // Here it's assumed that the quantized_op node only produces three outputs: + // out_data, min_range, and max_range. 
+ if (need_requantize_map.count(new_node->op()) > 0 + && need_requantize_map[new_node->op()](new_node->attrs)) { + NodePtr requantize_node = Node::Create(); + requantize_node->attrs.op = Op::Get("_contrib_requantize"); + requantize_node->attrs.name = "requantize_" + node->attrs.name; + if (requantize_node->op()->attr_parser != nullptr) { + requantize_node->op()->attr_parser(&(requantize_node->attrs)); + } + for (size_t i = 0; i < 3; ++i) { + requantize_node->inputs.emplace_back(NodeEntry{new_node, static_cast(i), 0}); + } + new_node = requantize_node; + } + } else { + // If the currently visited node does not need quantization, copy the current node to become + // the new_node. Meanwhile, check whether any inputs of the current node need quantization + // (e.g., a quantized_conv2d node), and insert a dequantize op node in the new graph if there + // are any. Otherwise, simply add a copy of the current node's entry to the inputs of + // the new_node. + *new_node = *node; + new_node->inputs.clear(); + for (const auto& e : node->inputs) { + NodePtr mirror_node = mirror_map.at(e.node.get()); + NodeEntry mirror_entry = NodeEntry{ + mirror_node, e.index, e.version}; + size_t num_outputs = e.node->num_outputs(); + uint32_t min_index = num_outputs + 2 * e.index; + uint32_t max_index = num_outputs + 2 * e.index + 1; + + // if input node is quantized operator, add dequantize node + if (NeedQuantize(e.node, excluded_nodes)) { + NodePtr dequantize_node = CreateNode("_contrib_dequantize", + e.node->attrs.name + "_dequantize"); + dequantize_node->inputs.emplace_back(mirror_entry); + dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, min_index, 0}); + dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, max_index, 0}); + dequantize_node->op()->attr_parser(&(dequantize_node->attrs)); + + new_node->inputs.emplace_back(NodeEntry{dequantize_node, 0, 0}); + mirror_map[e.node.get()] = std::move(dequantize_node); + } else { + new_node->inputs.emplace_back(NodeEntry{mirror_node, e.index, e.version}); + } + } + } + mirror_map[node.get()] = std::move(new_node); + }); + + std::vector outputs; + for (const auto& e : src.outputs) { + if (quantized_op_map.count(e.node->op())) { + NodePtr mirror_node = mirror_map.at(e.node.get()); + NodeEntry mirror_entry = NodeEntry{mirror_node, e.index, e.version}; + size_t num_inputs = e.node->num_inputs(); + uint32_t min_index = num_inputs + 2 * e.index; + uint32_t max_index = num_inputs + 2 * e.index + 1; + + NodePtr dequantize_node = CreateNode("_contrib_dequantize", + e.node->attrs.name + "_dequantize"); + dequantize_node->inputs.emplace_back(mirror_entry); + dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, min_index, 0}); + dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, max_index, 0}); + dequantize_node->op()->attr_parser(&(dequantize_node->attrs)); + outputs.emplace_back(NodeEntry{dequantize_node, 0, 0}); + } else { + outputs.emplace_back(NodeEntry{mirror_map.at(e.node.get()), e.index, e.version}); + } + } + + if (!offline_params.empty()) outputs = + OfflineParams(std::move(outputs), std::move(offline_params)); + + Graph ret; + ret.outputs = std::move(outputs); + return ret; +} + +Graph SetCalibTableToQuantizedGraph(Graph&& g) { + static const auto& flist_outputs = + nnvm::Op::GetAttr("FListOutputNames"); + static const auto& need_requantize_map = + nnvm::Op::GetAttr("FNeedRequantize"); + const auto& calib_table = + g.GetAttr>>("calib_table"); + DFSVisit(g.outputs, [&](const NodePtr& node) { + // If the current op is requantize + // find 
the thresholds from the calibration table with the key equal + // to the current op's input node name, e.g. a quantized_conv2d node. + if (node->op() != nullptr && node->op()->name == "_contrib_requantize") { + NodePtr quantized_op_node = node->inputs[0].node; + CHECK(quantized_op_node->op() != nullptr) << quantized_op_node->attrs.name + << " must be a quantized op node"; + CHECK(need_requantize_map.count(quantized_op_node->op()) > 0 + && need_requantize_map[quantized_op_node->op()](quantized_op_node->attrs)) + << quantized_op_node->attrs.name << " op must register FNeedRequantize attr" + " and the attr func should return true"; + std::string out_data_name = quantized_op_node->attrs.name + "_"; + auto list_output_names_func = flist_outputs.get(quantized_op_node->op(), nullptr); + // Here it's assumed that the quantized_op node only produces three outputs: + // out_data, min_range, and max_range. So we want to get the pre-calculated min_calib_range + // and max_calib_range from the calibration table for out_data. Here we create the output + // data name the same as it is constructed in GraphExecutor::ExecuteMonCallback. + if (list_output_names_func != nullptr) { + std::vector names = list_output_names_func(quantized_op_node->attrs); + CHECK_EQ(names.size(), 3U) << "ListOutputNames is expected to return three strings for" + " quantized operators"; + out_data_name += names[0]; + } else { + out_data_name += "0"; + } + const auto calib_table_iter = calib_table.find(out_data_name); + if (calib_table_iter != calib_table.end()) { + node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first); + node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second); + node->op()->attr_parser(&(node->attrs)); + } + } + }); + return g; +} + +NNVM_REGISTER_PASS(QuantizeGraph) +.describe("") +.set_body(QuantizeGraph) +.set_change_graph(true); + +NNVM_REGISTER_PASS(SetCalibTableToQuantizedGraph) +.describe("") +.set_body(SetCalibTableToQuantizedGraph) +.set_change_graph(true); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_conv.cc b/src/operator/quantization/quantized_conv.cc new file mode 100644 index 000000000000..d7dc9fe4dbd8 --- /dev/null +++ b/src/operator/quantization/quantized_conv.cc @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2017 by Contributors + * \file quantized_conv.cc + * \brief + * \author Ziheng Jiang, Jun Wu +*/ +#include "../nn/convolution-inl.h" + +namespace mxnet { +namespace op { + +bool QuantizedConvShape(const nnvm::NodeAttrs& attrs, + std::vector* in_shape, + std::vector* out_shape) { + using namespace mshadow; + const ConvolutionParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.num_group, 1U) << "quantized_conv only supports num_group=1 for now"; + CHECK_EQ(in_shape->size(), param.no_bias? 6U : 9U); + CHECK_EQ(out_shape->size(), 3U); + if (param.layout.has_value()) { + CHECK_EQ(param.layout.value(), mshadow::kNCHW) << "quantized_conv only supports NCHW for now"; + } + CHECK_EQ(param.kernel.ndim(), 2U) << "quantized_conv only supports 2D convolution for now"; + CHECK(param.dilate.ndim() == 0U || param.dilate.Size() == 1U) + << "quantized_conv only supports dilation=1 for all dimensions"; + const TShape& dshape = in_shape->at(0); + CHECK_EQ(dshape.ndim(), 4U); + if (dshape.ndim() == 0U) return false; + + const int N = 0, H = 2, W = 3, C = 1; + CHECK_EQ(dshape[C] % 4, 0U) + << "for 8bit cudnn conv, the number of channels must be a multiple of 4"; + CHECK_EQ(param.num_filter % 4, 0U) + << "for 8bit cudnn conv, the number of filters must be a multiple of 4"; + + TShape wshape{0, 0, 0, 0}; + wshape[N] = param.num_filter; + wshape[H] = param.kernel[0]; + wshape[W] = param.kernel[1]; + wshape[C] = dshape[C]; + SHAPE_ASSIGN_CHECK(*in_shape, 1, wshape); + const int start = param.no_bias? 2 : 3; + const int end = param.no_bias? 6 : 9; + for (int i = start; i < end; ++i) { + SHAPE_ASSIGN_CHECK(*in_shape, i, TShape{1}); + } + if (!param.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, 2, Shape1(param.num_filter)); + } + + auto AddPad = [](index_t dsize, index_t pad) { return dsize + 2 * pad; }; + TShape oshape{1, 1, 1, 1}; + oshape[N] = dshape[N]; + oshape[C] = wshape[N]; + oshape[H] = (AddPad(dshape[H], param.pad[0]) - wshape[H]) / param.stride[0] + 1; + oshape[W] = (AddPad(dshape[W], param.pad[1]) - wshape[W]) / param.stride[1] + 1; + + SHAPE_ASSIGN_CHECK(*out_shape, 0, oshape); + SHAPE_ASSIGN_CHECK(*out_shape, 1, TShape({1})); + SHAPE_ASSIGN_CHECK(*out_shape, 2, TShape({1})); + return true; +} + +bool QuantizedConvType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_type->size(), param.no_bias? 6U : 9U); + CHECK_EQ(out_type->size(), 3U); + TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_type, 1, mshadow::kInt8); + if (!param.no_bias) { + TYPE_ASSIGN_CHECK(*in_type, 2, mshadow::kInt8); + } + + const size_t start = param.no_bias? 2 : 3; + const size_t end = param.no_bias? 6 : 9; + for (size_t i = start; i < end; ++i) { + TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kFloat32); + } + + TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt32); + TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32); + return true; +} + +NNVM_REGISTER_OP(_contrib_quantized_conv) +.describe(R"code(Convolution operator for input, weight and bias data types of int8, +which accumulates in int32 for the output. For each argument, two more arguments of type +float32 must be provided representing the thresholds used to quantize that argument from +float32 to int8. The final outputs contain the convolution result in int32, and min +and max values representing the thresholds for quantizing the float32 output into int32. + +.. 
Note:: + This operator only supports forward propagation. DO NOT use it in training.)code" ADD_FILELINE) +.set_num_inputs( + [](const NodeAttrs& attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + return param.no_bias? 6 : 9; + }) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + if (param.no_bias) { + return std::vector{"data", "weight", "min_data", "max_data", + "min_weight", "max_weight"}; + } else { + return std::vector{"data", "weight", "bias", "min_data", "max_data", + "min_weight", "max_weight", "min_bias", "max_bias"}; + } + }) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "min_output", "max_output"}; + }) +.set_attr("FInferShape", QuantizedConvShape) +.set_attr("FInferType", QuantizedConvType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector(1, ResourceRequest::kTempSpace); + }) +.set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) +.add_argument("data", "NDArray-or-Symbol", "Input data.") +.add_argument("weight", "NDArray-or-Symbol", "weight.") +.add_argument("bias", "NDArray-or-Symbol", "bias.") +.add_argument("min_data", "NDArray-or-Symbol", "Minimum value of data.") +.add_argument("max_data", "NDArray-or-Symbol", "Maximum value of data.") +.add_argument("min_weight", "NDArray-or-Symbol", "Minimum value of weight.") +.add_argument("max_weight", "NDArray-or-Symbol", "Maximum value of weight.") +.add_argument("min_bias", "NDArray-or-Symbol", "Minimum value of bias.") +.add_argument("max_bias", "NDArray-or-Symbol", "Maximum value of bias.") +.add_arguments(ConvolutionParam::__FIELDS__()); + +NNVM_REGISTER_OP(Convolution) +.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs.op = Op::Get("_contrib_quantized_conv"); + node->attrs.name = "quantized_" + attrs.name; + node->attrs.dict = attrs.dict; + if (node->op()->attr_parser != nullptr) { + node->op()->attr_parser(&(node->attrs)); + } + return node; + }); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_conv.cu b/src/operator/quantization/quantized_conv.cu new file mode 100644 index 000000000000..2db5416309b5 --- /dev/null +++ b/src/operator/quantization/quantized_conv.cu @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2017 by Contributors + * \file quantized_conv.cu + * \brief + * \author Ziheng Jiang, Jun Wu +*/ +#include "../nn/convolution-inl.h" +#include "./quantization_utils.h" +#include "../tensor/matrix_op-inl.h" + +namespace mxnet { +namespace op { + +// value + bias_value * (range1 / limit_range1) * (limit_range2 / range2) +struct QuantizedBiasAddKernel { + MSHADOW_XINLINE static void Map(int i, size_t bias_size, int32_t *out, + const int8_t *bias, const float *min_out, + const float *max_out, const float *min_bias, + const float *max_bias, const size_t spatial_size) { + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; + float float_for_one_out_quant = + MaxAbs(*min_out, *max_out) / static_cast(MaxValue()); + float float_for_one_bias_quant = + MaxAbs(*min_bias, *max_bias) / static_cast(MaxValue()); + const size_t channel_id = (i / spatial_size) % bias_size; + out[i] = (out[i] * float_for_one_out_quant + + bias[channel_id] * float_for_one_bias_quant) / + float_for_one_out_quant; + } +}; + +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 +template +class QuantizedCuDNNConvOp { + public: + QuantizedCuDNNConvOp() { + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + } + + void Init(const ConvolutionParam& param, + const OpContext& ctx, + const std::vector& in_shape, + const std::vector& out_shape) { + param_ = param; + CHECK_EQ(param_.kernel.ndim(), 2U) + << "QuantizedCuDNNConvOp only supports 2D convolution for now"; + if (param_.layout.has_value()) { + CHECK_EQ(param_.layout.value(), mshadow::kNCHW) + << "QuantizedConvOp only supports NCHW for now"; + } + if (param_.stride.ndim() == 0U) param_.stride = mshadow::Shape2(1, 1); + if (param_.dilate.ndim() == 0U) param_.dilate = mshadow::Shape2(1, 1); + if (param_.pad.ndim() == 0U) param_.pad = mshadow::Shape2(0, 0); + N = 0, H = 2, W = 3, C = 1; + src_type_ = mshadow::DataType::kCudnnFlag; + dst_type_ = mshadow::DataType::kCudnnFlag; + cmp_type_ = mshadow::DataType::kCudnnFlag; + algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + format_ = CUDNN_TENSOR_NHWC; + InitDescriptors(in_shape, out_shape); + GetTempSize(ctx); + } + + ~QuantizedCuDNNConvOp() { + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc_)); + } + + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + CHECK_EQ(param_.kernel.ndim(), 2U) + << "QuantizedCuDNNConvOp only supports 2D convolution for now"; + using namespace mshadow; + CHECK_EQ(in_data.size(), param_.no_bias? 
6U : 9U); + CHECK_EQ(out_data.size(), 3U); + Stream *s = ctx.get_stream(); + CHECK_EQ(s->dnn_handle_ownership_, Stream::OwnHandle); + + const TBlob& data = in_data[0]; + const TBlob& filter = in_data[1]; + const TBlob& out = out_data[0]; + const TShape& dshape = data.shape_; + const TShape& fshape = filter.shape_; + const TShape& oshape = out.shape_; + + // allocate workspace + const int dev_id = ctx.run_ctx.ctx.dev_id; + const int dev_mask = gpu::kDevMask; + if (!param_.layout.has_value() || param_.layout.value() == mshadow::kNCHW) { + const size_t data_size = dshape.Size(); + const size_t weight_size = fshape.Size(); + const size_t output_size = oshape.Size(); + size_t total_temp_bytes = (workspace_ + data_size + weight_size) * sizeof(SrcType) + + output_size * (sizeof(DstType) + sizeof(int32_t)); + Tensor temp_space = + ctx.requested[0].get_space_typed(mshadow::Shape1(total_temp_bytes), s); + char* temp_dptr = temp_space.dptr_; + TBlob data_(reinterpret_cast(temp_dptr), + TShape({dshape[N], dshape[H], dshape[W], dshape[C]}), + dev_mask, DataType::kFlag, dev_id); + temp_dptr += data_size * sizeof(SrcType); + TBlob filter_(reinterpret_cast(temp_dptr), + TShape({fshape[N], fshape[H], fshape[W], fshape[C]}), + dev_mask, DataType::kFlag, dev_id); + temp_dptr += weight_size * sizeof(SrcType); + + // input: [NCHW] => [NHWC](batch, in_height, in_width, in_channels) + // filter: [NCHW] => [NHWC](out_channels, filter_height, filter_width, in_channels) + TransposeImpl(ctx.run_ctx, data, data_, TShape({N, H, W, C})); + TransposeImpl(ctx.run_ctx, filter, filter_, TShape({N, H, W, C})); + TBlob out_(reinterpret_cast(temp_dptr), + TShape({oshape[N], oshape[H], oshape[W], oshape[C]}), + dev_mask, DataType::kFlag, dev_id); + temp_dptr += output_size * sizeof(DstType); + TBlob out_tcast(reinterpret_cast(temp_dptr), + TShape({oshape[N], oshape[H], oshape[W], oshape[C]}), + dev_mask, DataType::kFlag, dev_id); + temp_dptr += output_size * sizeof(int32_t); + // input: [NHWC](batch, in_height, in_width, in_channels) + // filter: [HWNC](out_channels, filter_height, filter_width, in_channels) + // output: [NHWC](batch, out_height, out_width, out_channels) + + CUDNN_CALL(cudnnConvolutionForward(s->dnn_handle_, + &alpha_, + data_desc_, + data_.dptr_, + filter_desc_, + filter_.dptr_, + conv_desc_, + algo_, + temp_dptr, + workspace_byte_, + &beta_, + out_desc_, + out_.dptr_)); + + Tensor out_tensor = out_.FlatTo1D(s); + Tensor out_tcast_tensor = out_tcast.FlatTo1D(s); + Assign(out_tcast_tensor, kWriteTo, mshadow::expr::tcast(out_tensor)); + // output: [NHWC](batch, out_height, out_width, out_channels) => [NCHW] + TransposeImpl(ctx.run_ctx, out_tcast, out, TShape({0, 3, 1, 2})); + } else { + LOG(FATAL) << "quantized_conv only supports NCHW for now"; + } + + // calculate the min/max range for out_data as it's a multiplication + // of in_data[0] and in_data[1]. Need to rescale the min/max range of out_data + // based on the min/max ranges of in_data[0] and in_data[1]. + const size_t num_inputs = param_.no_bias ? 
2 : 3; + mxnet_op::Kernel::Launch(s, 1, + out_data[1].dptr(), out_data[2].dptr(), + in_data[num_inputs].dptr(), in_data[num_inputs+1].dptr(), + in_data[num_inputs+2].dptr(), in_data[num_inputs+3].dptr()); + + if (!param_.no_bias) { + if (param_.layout.has_value()) { + CHECK_EQ(param_.layout.value(), mshadow::kNCHW) + << "quantized_conv only supports NCHW when there is a bias"; + } + const TBlob& bias = in_data[2]; + mxnet_op::Kernel::Launch(s, out.Size(), + bias.Size(), out.dptr(), bias.dptr(), + out_data[1].dptr(), out_data[2].dptr(), + in_data[7].dptr(), in_data[8].dptr(), + oshape[2] * oshape[3]); + } + } + + void InitDescriptors(const std::vector& in_shape, + const std::vector& out_shape) { + const TShape& dshape = in_shape[0]; + const TShape& kshape = in_shape[1]; + const TShape& oshape = out_shape[0]; + CUDNN_CALL(cudnnSetConvolution2dDescriptor(conv_desc_, + param_.pad[0], + param_.pad[1], + param_.stride[0], + param_.stride[1], + 1, + 1, + CUDNN_CROSS_CORRELATION, + cmp_type_)); + + CUDNN_CALL(cudnnSetTensor4dDescriptor(data_desc_, + format_, + src_type_, + dshape[N], + dshape[C], + dshape[H], + dshape[W])); + CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, + format_, + dst_type_, + oshape[N], + oshape[C], + oshape[H], + oshape[W])); + CUDNN_CALL(cudnnSetFilter4dDescriptor(filter_desc_, + src_type_, + format_, + kshape[N], + kshape[C], + kshape[H], + kshape[W])); + } + + void GetTempSize(const OpContext& ctx) { + mshadow::Stream *s = ctx.get_stream(); + CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_, + data_desc_, + filter_desc_, + conv_desc_, + out_desc_, + algo_, + &workspace_byte_)); + workspace_ = workspace_byte_ / sizeof(SrcType) + 1; + } + + private: + ConvolutionParam param_; + size_t workspace_; + size_t workspace_byte_; + cudnnDataType_t src_type_; + cudnnDataType_t dst_type_; + cudnnDataType_t cmp_type_; + cudnnTensorFormat_t format_; + cudnnConvolutionDescriptor_t conv_desc_; + cudnnTensorDescriptor_t data_desc_; + cudnnFilterDescriptor_t filter_desc_; + cudnnTensorDescriptor_t out_desc_; + cudnnConvolutionFwdAlgo_t algo_; + uint32_t N, H, W, C; + float alpha_ = 1.0f; + float beta_ = 0.0f; +}; // class QuantizedCuDNNConvOp +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 + +void QuantizedConvForwardGPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.kernel.ndim(), 2U) + << "QuantizedConvForward only supports 2D convolution for now"; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 + typedef QuantizedCuDNNConvOp QuantizedConvOpInt8; +#if DMLC_CXX11_THREAD_LOCAL + static thread_local QuantizedConvOpInt8 op; +#else + static MX_THREAD_LOCAL QuantizedConvOpInt8 op; +#endif // DMLC_CXX11_THREAD_LOCAL + op.Init(param, ctx, {inputs[0].shape_, inputs[1].shape_}, {outputs[0].shape_}); + op.Forward(ctx, inputs, req, outputs); +#else + LOG(FATAL) << "QuantizedConvForward only supports cudnnConvolutionForward for now"; +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 +} + +NNVM_REGISTER_OP(_contrib_quantized_conv) +.set_attr("FCompute", QuantizedConvForwardGPU); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_flatten-inl.h b/src/operator/quantization/quantized_flatten-inl.h new file mode 100644 index 000000000000..95f366154022 --- /dev/null +++ b/src/operator/quantization/quantized_flatten-inl.h @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantized_flatten-inl.h + * \brief implementation of quantized flatten operation + */ +#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FLATTEN_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FLATTEN_INL_H_ + +#include +#include +#include +#include "../elemwise_op_common.h" +#include "../mshadow_op.h" +#include "../mxnet_op.h" +#include "./quantization_utils.h" + +namespace mxnet { +namespace op { + +// keep zero-center +struct quantized_flatten { + template + MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, + float *omax_range, const SrcDType *in, + const float *imin_range, const float *imax_range) { + out[i] = in[i]; + omin_range[0] = imin_range[0]; + omax_range[0] = imax_range[0]; + } +}; + +template +void QuantizedFlattenCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 3U); + CHECK_EQ(req.size(), 3U); + if (req[0] == kWriteInplace && req[1] == kWriteInplace && req[2] == kWriteInplace) return; + using namespace mshadow; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + + typedef int8_t DstDType; + typedef int8_t SrcDType; + Kernel::Launch(s, outputs[0].Size(), + outputs[0].dptr(), outputs[1].dptr(), outputs[2].dptr(), + inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr()); +} + +inline bool QuantizedFlattenShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), 3U); + + const TShape &dshape = (*in_attrs)[0]; + if (shape_is_none(dshape)) return false; + + uint32_t target_dim = 1; + for (uint32_t i = 1; i < dshape.ndim(); ++i) { + target_dim *= dshape[i]; + } + + SHAPE_ASSIGN_CHECK(*in_attrs, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*in_attrs, 2, TShape{1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape2(dshape[0], target_dim)); + SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 2, TShape{1}); + return true; +} + +inline bool QuantizedFlattenType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), 3U); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32); + return (*in_attrs)[0] != -1; +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FLATTEN_INL_H_ 
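The quantized_flatten kernel above only copies the int8 payload and forwards the input min/max range, so the operator behaves as a pure reshape at the quantized level. A minimal usage sketch of the intended frontend behavior (assuming the `mx.nd.contrib.quantized_flatten` binding generated from the registration below, as exercised by the tests at the end of this patch):

import mxnet as mx
import numpy as np

# int8 payload plus the float32 range it was quantized with
qdata = mx.nd.random.uniform(low=-127, high=127, shape=(2, 3, 4)).astype('int8')
min_data = mx.nd.array([-1.0], dtype='float32')
max_data = mx.nd.array([1.0], dtype='float32')

qout, min_out, max_out = mx.nd.contrib.quantized_flatten(qdata, min_data, max_data)

assert qout.shape == (2, 12)  # (batch, prod of remaining dims)
# values are copied verbatim and the range is passed through unchanged
assert np.array_equal(qdata.asnumpy().flatten(), qout.asnumpy().flatten())
assert min_out.asscalar() == min_data.asscalar()
assert max_out.asscalar() == max_data.asscalar()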
diff --git a/src/operator/quantization/quantized_flatten.cc b/src/operator/quantization/quantized_flatten.cc new file mode 100644 index 000000000000..3f426a59bdd2 --- /dev/null +++ b/src/operator/quantization/quantized_flatten.cc @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantized_flatten.cc + * \brief + */ +#include +#include "./quantized_flatten-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_quantized_flatten) +.set_num_inputs(3) +.set_num_outputs(3) +.set_attr("FInferShape", QuantizedFlattenShape) +.set_attr("FInferType", QuantizedFlattenType) +.set_attr("FCompute", QuantizedFlattenCompute) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "min_data", "max_data"}; + }) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "min_output", "max_output"}; + }) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}, {1, 1}, {2, 2}}; + }) +.add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("min_data", "NDArray-or-Symbol", "The minimum scalar value " + "possibly produced for the data") +.add_argument("max_data", "NDArray-or-Symbol", "The maximum scalar value " + "possibly produced for the data"); + +NNVM_REGISTER_OP(Flatten) +.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs.op = Op::Get("_contrib_quantized_flatten"); + node->attrs.name = "quantized_" + attrs.name; + node->attrs.dict = attrs.dict; + if (node->op()->attr_parser != nullptr) { + node->op()->attr_parser(&(node->attrs)); + } + return node; + }); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_flatten.cu b/src/operator/quantization/quantized_flatten.cu new file mode 100644 index 000000000000..4f0c8f93ab06 --- /dev/null +++ b/src/operator/quantization/quantized_flatten.cu @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantized_flatten.cu + * \brief + */ +#include "./quantized_flatten-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_quantized_flatten) +.set_attr("FCompute", QuantizedFlattenCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc new file mode 100644 index 000000000000..e334fe7ec9b2 --- /dev/null +++ b/src/operator/quantization/quantized_fully_connected.cc @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantized_fully_connected.cc + * \brief + * \author Ziheng Jiang, Jun Wu +*/ +#include "../nn/fully_connected-inl.h" + +namespace mxnet { +namespace op { + +bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + CHECK(param.flatten) << "QuantizedFullyConnectedOp only supports flatten=true for now"; + using namespace mshadow; + uint32_t num_inputs = param.no_bias ? 2 : 3; + CHECK_EQ(in_shape->size(), num_inputs * 3); + CHECK_EQ(out_shape->size(), 3U); + + CHECK(!shape_is_none(in_shape->at(0))) + << "QuantizedFullyConnectedOp input data shape must be given"; + const TShape& dshape = in_shape->at(0); + TShape wshape = Shape2(param.num_hidden, dshape.ProdShape(1, dshape.ndim())); + SHAPE_ASSIGN_CHECK(*in_shape, 1, wshape); + if (!param.no_bias) { + TShape bshape = Shape1(param.num_hidden); + SHAPE_ASSIGN_CHECK(*in_shape, 2, bshape); + } + + for (size_t i = num_inputs; i < 3 * num_inputs; ++i) { + SHAPE_ASSIGN_CHECK(*in_shape, i, TShape{1}); + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, TShape({dshape[0], wshape[0]})); + SHAPE_ASSIGN_CHECK(*out_shape, 1, TShape({1})); + SHAPE_ASSIGN_CHECK(*out_shape, 2, TShape({1})); + return true; +} + +bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t num_inputs = param.no_bias ? 
2 : 3; + CHECK_EQ(in_type->size(), num_inputs * 3); + CHECK_EQ(out_type->size(), 3U); + + for (size_t i = 0; i < num_inputs; ++i) { + TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8); + } + for (size_t i = num_inputs; i < 3 * num_inputs; ++i) { + TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kFloat32); + } + + TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt32); + TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32); + return true; +} + +NNVM_REGISTER_OP(_contrib_quantized_fully_connected) +.describe(R"code(Fully Connected operator for input, weight and bias data type of int8, +and accumulates in type int32 for the output. For each argument, two more arguments of type +float32 must be provided representing the thresholds used to quantize that argument from data +type float32 to int8. The final outputs contain the fully connected result in int32, and the min +and max thresholds for quantizing the float32 output into int32. + +.. Note:: + This operator only supports forward propagation. DO NOT use it in training.)code" ADD_FILELINE) +.set_num_inputs( + [](const NodeAttrs& attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + return param.no_bias ? 6 : 9; + }) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + if (param.no_bias) { + return std::vector{"data", "weight", "min_data", "max_data", + "min_weight", "max_weight"}; + } else { + return std::vector{"data", "weight", "bias", "min_data", "max_data", + "min_weight", "max_weight", "min_bias", "max_bias"}; + } + }) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "min_output", "max_output"}; + }) +.set_attr("FInferShape", QuantizedFullyConnectedShape) +.set_attr("FInferType", QuantizedFullyConnectedType) +.set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) +.add_argument("data", "NDArray-or-Symbol", "Input data.") +.add_argument("weight", "NDArray-or-Symbol", "weight.") +.add_argument("bias", "NDArray-or-Symbol", "bias.") +.add_argument("min_data", "NDArray-or-Symbol", "Minimum value of data.") +.add_argument("max_data", "NDArray-or-Symbol", "Maximum value of data.") +.add_argument("min_weight", "NDArray-or-Symbol", "Minimum value of weight.") +.add_argument("max_weight", "NDArray-or-Symbol", "Maximum value of weight.") +.add_argument("min_bias", "NDArray-or-Symbol", "Minimum value of bias.") +.add_argument("max_bias", "NDArray-or-Symbol", "Maximum value of bias.") +.add_arguments(FullyConnectedParam::__FIELDS__()); + +NNVM_REGISTER_OP(FullyConnected) +.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs.op = Op::Get("_contrib_quantized_fully_connected"); + node->attrs.name = "quantized_" + attrs.name; + node->attrs.dict = attrs.dict; + if (node->op()->attr_parser != nullptr) { + node->op()->attr_parser(&(node->attrs)); + } + return node; + }); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_fully_connected.cu b/src/operator/quantization/quantized_fully_connected.cu new file mode 100644 index 000000000000..ac7ba1e21df8 --- /dev/null +++ b/src/operator/quantization/quantized_fully_connected.cu @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantized_fully_connected.cu + * \brief + * \author Ziheng Jiang, Jun Wu +*/ +#include "./quantization_utils.h" +#include "../mxnet_op.h" +#include "../nn/fully_connected-inl.h" + +namespace mxnet { +namespace op { + +// value + bias_value * (range1 / limit_range1) * (limit_range2 / range2) +struct QuantizedBiasAddKernel { + MSHADOW_XINLINE static void Map(int i, size_t k, int32_t *out, + const int8_t *bias, const float *min_out, + const float *max_out, const float *min_bias, + const float *max_bias) { + typedef int32_t T1; + typedef int8_t T2; + using mshadow::red::limits::MinValue; + using mshadow::red::limits::MaxValue; + float float_for_one_out_quant = + MaxAbs(*min_out, *max_out) / static_cast(MaxValue()); + float float_for_one_bias_quant = + MaxAbs(*min_bias, *max_bias) / static_cast(MaxValue()); + out[i] = (out[i] * float_for_one_out_quant + + bias[i%k] * float_for_one_bias_quant) / + float_for_one_out_quant; + } +}; + +template +void QuantizedFullyConnectedForwardGPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + using namespace mshadow; + using namespace mxnet_op; + size_t num_inputs = param.no_bias ? 
2 : 3; + CHECK_EQ(inputs.size(), num_inputs * 3); + CHECK_EQ(outputs.size(), 3U); + Stream *s = ctx.get_stream(); + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle); + const TBlob& data = inputs[0]; + const TBlob& weight = inputs[1]; + const TBlob& out = outputs[0]; + TShape dshape = data.shape_; + TShape wshape = weight.shape_; + TShape oshape = out.shape_; + // (m, n) * (k, n).T = (m, k) + // A * B.T = C + + // row_C = col_C(T) = cublas(col_B * col_A(T)) = cublas(row_B(T), row_A) + // row_C = col_C(T) = cublas(col_B(T) * col_A(T)) = cublas(row_B, row_A) + const int m = dshape[0], n = dshape.ProdShape(1, dshape.ndim()), k = wshape[0]; + CmpType alpha = 1.0f; + CmpType beta = 0.0f; + const cudaDataType src_type = mshadow::DataType::kCudaFlag; + const cudaDataType dst_type = mshadow::DataType::kCudaFlag; + const cudaDataType cmp_type = mshadow::DataType::kCudaFlag; + CUBLAS_CALL(cublasGemmEx(s->blas_handle_, + CUBLAS_OP_T, + CUBLAS_OP_N, + k, + m, + n, + &alpha, + weight.dptr_, + src_type, + n, + data.dptr_, + src_type, + n, + &beta, + out.dptr_, + dst_type, + k, + cmp_type, + CUBLAS_GEMM_DFALT)); + + Kernel::Launch(s, 1, + outputs[1].dptr(), outputs[2].dptr(), + inputs[num_inputs].dptr(), inputs[num_inputs+1].dptr(), + inputs[num_inputs+2].dptr(), inputs[num_inputs+3].dptr()); + + if (!param.no_bias) { + const TBlob& bias = inputs[2]; + Kernel::Launch(s, out.Size(), + k, out.dptr(), bias.dptr(), + outputs[1].dptr(), outputs[2].dptr(), + inputs[7].dptr(), inputs[8].dptr()); + } +} + +NNVM_REGISTER_OP(_contrib_quantized_fully_connected) +.set_attr("FCompute", QuantizedFullyConnectedForwardGPU); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc new file mode 100644 index 000000000000..71f4e738161d --- /dev/null +++ b/src/operator/quantization/quantized_pooling.cc @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2017 by Contributors + * \file quantized_pooling.cc +*/ +#include +#include "../nn/pooling-inl.h" + +namespace mxnet { +namespace op { + +bool QuantizedPoolingShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 3U); + if (shape_is_none(in_shape->at(0))) return false; + const TShape &dshape = (*in_shape)[0]; + CHECK_EQ(dshape.ndim(), 4U) + << "quantized_pooling: Input data should be 4D in " + << "(batch, channel, y, x)"; + // NCHW layout + const int N = 0, H = 2, W = 3, C = 1; + TShape oshape(4); + CHECK_EQ(param.kernel.ndim(), 2) << "QuantizedPoolingOp only supports 2D pooling for now"; + CHECK(param.kernel[0] <= dshape[H] + 2 * param.pad[0]) + << "kernel size (" << param.kernel[0] + << ") exceeds input (" << dshape[H] + << " padded to " << (dshape[H] + 2*param.pad[0]) << ")"; + CHECK(param.kernel[1] <= dshape[W] + 2 * param.pad[1]) + << "kernel size (" << param.kernel[1] + << ") exceeds input (" << dshape[W] + << " padded to " << (dshape[W] + 2*param.pad[1]) << ")"; + // only support valid convention + oshape[N] = dshape[N]; + oshape[C] = dshape[C]; + if (param.global_pool) { + oshape[H] = 1; + oshape[W] = 1; + } else { + oshape[H] = 1 + (dshape[H] + 2 * param.pad[0] - param.kernel[0]) / + param.stride[0]; + oshape[W] = 1 + (dshape[W] + 2 * param.pad[1] - param.kernel[1]) / + param.stride[1]; + } + + SHAPE_ASSIGN_CHECK(*in_shape, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*in_shape, 2, TShape{1}); + + out_shape->clear(); + out_shape->push_back(oshape); + out_shape->push_back(TShape{1}); + out_shape->push_back(TShape{1}); + return true; +} + +bool QuantizedPoolingType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_type->size(), 3U); + CHECK_EQ(out_type->size(), 3U); + if (param.pool_type == pool_enum::kMaxPooling || param.pool_type == pool_enum::kAvgPooling) { + TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt8); + } else { + LOG(FATAL) << "QuantizedPoolingOp only supports pool_type=max/avg for now"; + } + TYPE_ASSIGN_CHECK(*in_type, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_type, 2, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32); + return true; +} + +NNVM_REGISTER_OP(_contrib_quantized_pooling) +.set_num_inputs(3) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "min_data", "max_data"}; + }) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "min_output", "max_output"}; + }) +.set_attr("FInferShape", QuantizedPoolingShape) +.set_attr("FInferType", QuantizedPoolingType) +.set_attr("FNeedRequantize", + [](const NodeAttrs& attrs) { + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK(param.pool_type == pool_enum::kMaxPooling || param.pool_type == pool_enum::kAvgPooling) + << "QuantizedPoolingOp only supports pool_type=max/avg for now"; + return false; + }) +.add_argument("data", "NDArray-or-Symbol", "Input data.") +.add_argument("min_data", "NDArray-or-Symbol", "Minimum value of data.") +.add_argument("max_data", "NDArray-or-Symbol", "Maximum value of data.") +.add_arguments(PoolingParam::__FIELDS__()); + +NNVM_REGISTER_OP(Pooling) +.describe(R"code(Pooling operator for input and output 
data type of int8. +The input and output data come with min and max thresholds for quantizing +the float32 data into int8. + +.. Note:: + This operator only supports forward propagation. DO NOT use it in training. + This operator only supports `pool_type` of `avg` or `max`.)code" ADD_FILELINE) +.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) { + PoolingParam param; + param.Init(attrs.dict); + // TODO(junwu): Uncomment the following line and remove the above lines + // after pooling op is refactored + // const PoolingParam& param = nnvm::get(attrs.parsed); + nnvm::NodePtr node = nnvm::Node::Create(); + if (param.pool_type == pool_enum::kMaxPooling || param.pool_type == pool_enum::kAvgPooling) { + node->attrs.op = Op::Get("_contrib_quantized_pooling"); + node->attrs.name = "quantized_" + attrs.name; + } else { + node->attrs.op = Op::Get("Pooling"); + node->attrs.name = attrs.name; + } + node->attrs.dict = attrs.dict; + if (node->op()->attr_parser != nullptr) { + node->op()->attr_parser(&(node->attrs)); + } + return node; + }); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_pooling.cu b/src/operator/quantization/quantized_pooling.cu new file mode 100644 index 000000000000..78011b885c53 --- /dev/null +++ b/src/operator/quantization/quantized_pooling.cu @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*!
+ * Copyright (c) 2017 by Contributors + * \file quantized_pooling.cu +*/ +#include +#include +#include "../nn/pooling-inl.h" +#include "../mshadow_op.h" + +namespace mxnet { +namespace op { + +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 +template +class QuantizedCuDNNPoolingOp { + public: + QuantizedCuDNNPoolingOp() { + CUDNN_CALL(cudnnCreatePoolingDescriptor(&pool_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + } + + void Init(const PoolingParam& param, const TShape& dshape, const TShape& oshape) { + const int N = 0, H = 2, W = 3, C = 1; + const cudnnDataType_t dtype = mshadow::DataType::kCudnnFlag; + CHECK(param.kernel.ndim() == 2) << "Only support 2D pooling"; + if (param.pool_type == pool_enum::kMaxPooling) { + mode_ = CUDNN_POOLING_MAX; + } else if (param.pool_type == pool_enum::kAvgPooling) { + mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } else { + LOG(FATAL) << "QuantizedCuDNNPoolingOp only supports pool_type=max/avg"; + } + CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, + CUDNN_TENSOR_NCHW, + dtype, + dshape[N], + dshape[C], + dshape[H], + dshape[W])); + CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, + CUDNN_TENSOR_NCHW, + dtype, + oshape[N], + oshape[C], + oshape[H], + oshape[W])); + CUDNN_CALL(cudnnSetPooling2dDescriptor(pool_desc_, + mode_, + CUDNN_NOT_PROPAGATE_NAN, + param.global_pool ? dshape[2] : param.kernel[0], + param.global_pool ? dshape[3] : param.kernel[1], + param.pad[0], + param.pad[1], + param.global_pool ? 1 : param.stride[0], + param.global_pool ? 1 : param.stride[1])); + } + + ~QuantizedCuDNNPoolingOp() { + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyPoolingDescriptor(pool_desc_)); + } + + void Forward(mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 3U); + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, + pool_desc_, + &alpha, + in_desc_, + inputs[0].dptr_, + &beta, + out_desc_, + outputs[0].dptr_)); + + Tensor omin_range = outputs[1].FlatTo1D(s); + Tensor omax_range = outputs[2].FlatTo1D(s); + ASSIGN_DISPATCH(omin_range, req[1], + F(inputs[1].FlatTo1D(s))); + ASSIGN_DISPATCH(omax_range, req[2], + F(inputs[2].FlatTo1D(s))); + } + + private: + cudnnPoolingMode_t mode_; + cudnnTensorDescriptor_t in_desc_; + cudnnTensorDescriptor_t out_desc_; + cudnnPoolingDescriptor_t pool_desc_; +}; // class QuantizedCuDNNPoolingOp +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 + +void QuantizedPoolingForwardGPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.kernel.ndim(), 2U) + << "QuantizedPoolingForward only supports 2D pooling for now"; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 +#if DMLC_CXX11_THREAD_LOCAL + static thread_local QuantizedCuDNNPoolingOp op; +#else + static MX_THREAD_LOCAL QuantizedCuDNNPoolingOp op; +#endif // DMLC_CXX11_THREAD_LOCAL + op.Init(param, {inputs[0].shape_}, {outputs[0].shape_}); + op.Forward(ctx.get_stream(), inputs, req, outputs); +#else + LOG(FATAL) << "QuantizedPoolingForward only supports cudnnPoolingForward for now";
+#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 +} + +NNVM_REGISTER_OP(_contrib_quantized_pooling) +.set_attr("FCompute", QuantizedPoolingForwardGPU); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/requantize-inl.h b/src/operator/quantization/requantize-inl.h new file mode 100644 index 000000000000..e07a149f8a6b --- /dev/null +++ b/src/operator/quantization/requantize-inl.h @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file requantize-inl.h + * \brief implementation of requantize operation + */ +#ifndef MXNET_OPERATOR_QUANTIZATION_REQUANTIZE_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_REQUANTIZE_INL_H_ + +#include +#include +#include +#include "../elemwise_op_common.h" +#include "../mshadow_op.h" +#include "../mxnet_op.h" +#include "./quantization_utils.h" +#include "../tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +struct RequantizeParam : public dmlc::Parameter { + dmlc::optional min_calib_range; // min float value calculated from calibration dataset + dmlc::optional max_calib_range; // max float value calculated from calibration dataset + DMLC_DECLARE_PARAMETER(RequantizeParam) { + DMLC_DECLARE_FIELD(min_calib_range) + .set_default(dmlc::optional()) + .describe("The minimum scalar value in the form of float32 obtained " + "through calibration. If present, it will be used to requantize the " + "int32 data into int8."); + DMLC_DECLARE_FIELD(max_calib_range) + .set_default(dmlc::optional()) + .describe("The maximum scalar value in the form of float32 obtained " + "through calibration.
If present, it will be used to requantize the " + "int32 data into int8."); + } +}; + +inline bool RequantizeType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), 3U); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt32); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32); + return (*in_attrs)[0] != -1; +} + +struct RequantizeKernel { + template + MSHADOW_XINLINE static void Map(int i, T2 *output, float *omin_range, float *omax_range, + const T1 *input, const float *imin_range, const float *imax_range, const float real_range) { + const float input_float = QuantizedToFloat(input[i], *imin_range, *imax_range); + *omin_range = -real_range; + *omax_range = real_range; + output[i] = FloatToQuantized(input_float, -real_range, real_range); + } + + template + MSHADOW_XINLINE static void Map(int i, T2 *output, float *omin_range, float *omax_range, + const T1 *input, const float *imin_range, const float *imax_range, + const float *actual_min, const float *actual_max) { + Map(i, output, omin_range, omax_range, input, imin_range, imax_range, + MaxAbs(*actual_min, *actual_max)); + } +}; + +template +inline size_t ConfigReduce(mshadow::Stream* s, + const TShape& data_shape, + const TShape& out_shape, + TShape* src_shape, + TShape* dst_shape) { + BroadcastReduceShapeCompact(data_shape, out_shape, src_shape, dst_shape); + constexpr int NDim = 2; + CHECK_EQ(src_shape->ndim(), NDim); + CHECK_EQ(dst_shape->ndim(), NDim); + + return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape); +} + +template +void RequantizeForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + typedef int32_t SrcDType; + typedef int8_t DstDType; + Stream *s = ctx.get_stream(); + const RequantizeParam& param = + nnvm::get(attrs.parsed); + + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + Kernel::Launch(s, inputs[0].Size(), + outputs[0].dptr(), outputs[1].dptr(), outputs[2].dptr(), + inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), + MaxAbs(param.min_calib_range.value(), param.max_calib_range.value())); + } else { // model is not calibrated + TShape src_shape, dst_shape; + const size_t actual_float_size = sizeof(float); + const size_t actual_quantized_size = sizeof(SrcDType); + const size_t temp_reduce_size = ConfigReduce( + s, inputs[0].shape_, TShape({1}), &src_shape, &dst_shape); + Tensor temp_space = + ctx.requested[0].get_space_typed( + Shape1(2*actual_float_size+2*actual_quantized_size+temp_reduce_size), s); + Tensor actual_min_float( + reinterpret_cast(temp_space.dptr_), Shape1(1), s); + Tensor actual_max_float( + reinterpret_cast(temp_space.dptr_) + 1, Shape1(1), s); + + const int dev_id = ctx.run_ctx.ctx.dev_id; + TBlob actual_min_quantized(reinterpret_cast( + temp_space.dptr_ + 8), Shape1(1), xpu::kDevMask, dev_id); + TBlob actual_max_quantized(reinterpret_cast( + temp_space.dptr_ + 8) + 1, Shape1(1), xpu::kDevMask, dev_id); + Tensor workspace( + temp_space.dptr_+2*actual_float_size+2*actual_quantized_size, Shape1(temp_reduce_size), s); + broadcast::Reduce( + s, actual_min_quantized.reshape(dst_shape), + kWriteTo, 
workspace, inputs[0].reshape(src_shape)); + Kernel::Launch(s, 1, + actual_min_float.dptr_, actual_min_quantized.dptr(), + inputs[1].dptr(), inputs[2].dptr()); + + broadcast::Reduce( + s, actual_max_quantized.reshape(dst_shape), + kWriteTo, workspace, inputs[0].reshape(src_shape)); + Kernel::Launch(s, 1, + actual_max_float.dptr_, actual_max_quantized.dptr(), + inputs[1].dptr(), inputs[2].dptr()); + + Kernel::Launch(s, inputs[0].Size(), + outputs[0].dptr(), outputs[1].dptr(), outputs[2].dptr(), + inputs[0].dptr(), inputs[1].dptr(), inputs[2].dptr(), + actual_min_float.dptr_, actual_max_float.dptr_); + } +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_QUANTIZATION_REQUANTIZE_INL_H_ diff --git a/src/operator/quantization/requantize.cc b/src/operator/quantization/requantize.cc new file mode 100644 index 000000000000..83ea37b835cf --- /dev/null +++ b/src/operator/quantization/requantize.cc @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file requantize.cc + * \brief + */ +#include "./requantize-inl.h" +#include "./quantize-inl.h" + +namespace mxnet { +namespace op { +DMLC_REGISTER_PARAMETER(RequantizeParam); + +NNVM_REGISTER_OP(_contrib_requantize) +.describe(R"code(Given data that is quantized in int32 and the corresponding thresholds, +requantize the data into int8 using min and max thresholds either calculated at runtime +or from calibration. It's highly recommended to pre-calculate the min and max thresholds +through calibration since it saves the runtime of the operator and improves the +inference accuracy. + +.. Note:: + This operator only supports forward propagation.
DO NOT use it in training.)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs(3) +.set_num_outputs(3) +.set_attr("FInferShape", QuantizeShape) +.set_attr("FInferType", RequantizeType) +.set_attr("FCompute", RequantizeForward) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { + const RequantizeParam& param = + nnvm::get(attrs.parsed); + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + return std::vector(); + } else { + return std::vector(1, ResourceRequest::kTempSpace); + } + }) +.add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `int32`") +.add_argument("min_range", "NDArray-or-Symbol", "The original minimum scalar value " + "in the form of float32 used for quantizing data into int32.") +.add_argument("max_range", "NDArray-or-Symbol", "The original maximum scalar value " + "in the form of float32 used for quantizing data into int32.") +.add_arguments(RequantizeParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/requantize.cu b/src/operator/quantization/requantize.cu new file mode 100644 index 000000000000..be8ae59124e5 --- /dev/null +++ b/src/operator/quantization/requantize.cu @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2017 by Contributors + * \file requantize.cu + * \brief + */ +#include "./requantize-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_requantize) +.set_attr("FCompute", RequantizeForward); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 630fef65a52e..b6bb39a19847 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -348,21 +348,21 @@ static inline uint64_t calc_num_load(const int X, const int Y, const int* stride } template -ReduceImplConfig ConfigureReduceImpl(const TBlob& small, const TBlob& big, const TBlob* lhs, - const TBlob* rhs) { +ReduceImplConfig ConfigureReduceImpl(const TShape& small, const TShape& big, const TShape* lhs, + const TShape* rhs) { ReduceImplConfig config; - diff(small.shape_.get(), big.shape_.get(), &config.rshape, &config.rstride); - config.N = small.shape_.Size(); + diff(small.get(), big.get(), &config.rshape, &config.rstride); + config.N = small.Size(); config.M = config.rshape.Size(); bool multiOp = false; if (lhs != NULL) { CHECK_NOTNULL(rhs); - diff(small.shape_.get(), lhs->shape_.get(), &config.lhs_shape, + diff(small.get(), lhs->get(), &config.lhs_shape, &config.lhs_stride); - diff(small.shape_.get(), rhs->shape_.get(), &config.rhs_shape, + diff(small.get(), rhs->get(), &config.rhs_shape, &config.rhs_stride); multiOp = true; } @@ -376,20 +376,20 @@ ReduceImplConfig ConfigureReduceImpl(const TBlob& small, const TBlob& big, } else { int reduce_strides[3]; - reduce_strides[0] = fastest_stride(small.shape_.get(), big.shape_.get(), - big.shape_.get()); - reduce_strides[1] = (multiOp) ? fastest_stride(small.shape_.get(), - lhs->shape_.get(), lhs->shape_.get()) : 1; - reduce_strides[2] = (multiOp) ? fastest_stride(small.shape_.get(), - rhs->shape_.get(), rhs->shape_.get()) : 1; + reduce_strides[0] = fastest_stride(small.get(), big.get(), + big.get()); + reduce_strides[1] = (multiOp) ? fastest_stride(small.get(), + lhs->get(), lhs->get()) : 1; + reduce_strides[2] = (multiOp) ? fastest_stride(small.get(), + rhs->get(), rhs->get()) : 1; int reduce_strides_transp[3]; - reduce_strides_transp[0] = fastest_stride(small.shape_.get(), config.rshape, + reduce_strides_transp[0] = fastest_stride(small.get(), config.rshape, config.rstride); reduce_strides_transp[1] = (multiOp) ? - fastest_stride(small.shape_.get(), config.lhs_shape, config.lhs_stride) : 1; + fastest_stride(small.get(), config.lhs_shape, config.lhs_stride) : 1; reduce_strides_transp[2] = (multiOp) ?
- fastest_stride(small.shape_.get(), config.rhs_shape, config.rhs_stride) : 1; + fastest_stride(small.get(), config.rhs_shape, config.rhs_stride) : 1; uint64_t num_load = calc_num_load(config.N, config.M, reduce_strides); uint64_t num_load_transp = calc_num_load(config.M, config.N, reduce_strides_transp); @@ -597,7 +597,8 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, const Tensor& workspace, const TBlob& big) { if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); - ReduceImplConfig config = ConfigureReduceImpl(small, big, NULL, NULL); + ReduceImplConfig config = + ConfigureReduceImpl(small.shape_, big.shape_, NULL, NULL); ReduceImpl(stream, small, req, big, workspace, config); } @@ -607,21 +608,22 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, const TBlob& lhs, const TBlob& rhs) { if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); - ReduceImplConfig config = ConfigureReduceImpl(small, big, &lhs, &rhs); + ReduceImplConfig config = + ConfigureReduceImpl(small.shape_, big.shape_, &lhs.shape_, &rhs.shape_); ReduceImpl(stream, small, lhs, rhs, req, big, workspace, config); } template -size_t ReduceWorkspaceSize(Stream *s, const TBlob& small, const OpReqType req, - const TBlob& big) { +size_t ReduceWorkspaceSize(Stream *s, const TShape& small, const OpReqType req, + const TShape& big) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, NULL, NULL); return config.workspace_size; } template -size_t ReduceWorkspaceSize(Stream *s, const TBlob& small, const OpReqType req, - const TBlob& big, const TBlob& lhs, const TBlob& rhs) { +size_t ReduceWorkspaceSize(Stream *s, const TShape& small, const OpReqType req, + const TShape& big, const TShape& lhs, const TShape& rhs) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, &lhs, &rhs); return config.workspace_size; diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 7f3e5685a086..76ec92a9e724 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -217,14 +217,14 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, } template -size_t ReduceWorkspaceSize(Stream *s, const TBlob& small, const OpReqType req, - const TBlob& big) { +size_t ReduceWorkspaceSize(Stream *s, const TShape& small, const OpReqType req, + const TShape& big) { return 0; } template -size_t ReduceWorkspaceSize(Stream *s, const TBlob& small, const OpReqType req, - const TBlob& big, const TBlob& lhs, const TBlob& rhs) { +size_t ReduceWorkspaceSize(Stream *s, const TShape& small, const OpReqType req, + const TShape& big, const TShape& lhs, const TShape& rhs) { return 0; } diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index 02d48b469703..f124ba3021e5 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -421,7 +421,7 @@ void ReduceAxesComputeImpl(const nnvm::NodeAttrs& attrs, const TBlob out_data = outputs[0].reshape(dst_shape); BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { size_t workspace_size = broadcast::ReduceWorkspaceSize( - s, out_data, req[0], in_data); + s, out_data.shape_, req[0], in_data.shape_); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); broadcast::Reduce( diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h 
b/src/operator/tensor/elemwise_binary_broadcast_op.h index af5f5ce3af80..a2e63fefad58 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -205,8 +205,10 @@ void BinaryBroadcastBackwardUseNone(const nnvm::NodeAttrs& attrs, const TBlob out = inputs[0].reshape(new_oshape); BROADCAST_NDIM_SWITCH(ndim, NDim, { // Request temporary storage - size_t workspace_size_l = ReduceWorkspaceSize(s, lhs, req[0], out); - size_t workspace_size_r = ReduceWorkspaceSize(s, rhs, req[1], out); + size_t workspace_size_l = ReduceWorkspaceSize( + s, lhs.shape_, req[0], out.shape_); + size_t workspace_size_r = ReduceWorkspaceSize( + s, rhs.shape_, req[1], out.shape_); size_t workspace_size = std::max(workspace_size_l, workspace_size_r); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -234,8 +236,10 @@ inline void BinaryBroadcastBackwardUseInImpl(const OpContext& ctx, const TBlob ograd = inputs[0].reshape(new_oshape); const TBlob lhs = inputs[1].reshape(new_lshape); const TBlob rhs = inputs[2].reshape(new_rshape); - size_t workspace_size_l = ReduceWorkspaceSize(s, lgrad, req[0], ograd, lhs, rhs); - size_t workspace_size_r = ReduceWorkspaceSize(s, rgrad, req[1], ograd, lhs, rhs); + size_t workspace_size_l = ReduceWorkspaceSize( + s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_); + size_t workspace_size_r = ReduceWorkspaceSize( + s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_); size_t workspace_size = std::max(workspace_size_l, workspace_size_r); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 475409e6a774..0c74cac2dca5 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -88,6 +88,7 @@ struct EyeParam : public dmlc::Parameter { .add_enum("float64", mshadow::kFloat64) .add_enum("float16", mshadow::kFloat16) .add_enum("uint8", mshadow::kUint8) + .add_enum("int8", mshadow::kInt8) .add_enum("int32", mshadow::kInt32) .add_enum("int64", mshadow::kInt64) .describe("Target data type."); diff --git a/tests/python/quantization/common.py b/tests/python/quantization/common.py new file mode 120000 index 000000000000..dccb90b10675 --- /dev/null +++ b/tests/python/quantization/common.py @@ -0,0 +1 @@ +../unittest/common.py \ No newline at end of file diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py new file mode 100644 index 000000000000..7b08f46e836a --- /dev/null +++ b/tests/python/quantization/test_quantization.py @@ -0,0 +1,447 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Some of the tests using CUDNN require a special GPU instruction called dp4a. 
+Ref: http://images.nvidia.com/content/pdf/tesla/184457-Tesla-P4-Datasheet-NV-Final-Letter-Web.pdf +""" +import mxnet as mx +import numpy as np +from mxnet.test_utils import assert_almost_equal, rand_ndarray, rand_shape_nd, same, DummyIter +from common import with_seed +from mxnet.module import Module +from mxnet.io import NDArrayIter + + +@with_seed() +def test_quantize_float32_to_int8(): + shape = rand_shape_nd(4) + data = rand_ndarray(shape, 'default', dtype='float32') + min_range = mx.nd.min(data) + max_range = mx.nd.max(data) + qdata, min_val, max_val = mx.nd.contrib.quantize(data, min_range, max_range, out_type='int8') + data_np = data.asnumpy() + min_range = min_range.asscalar() + max_range = max_range.asscalar() + real_range = np.maximum(np.abs(min_range), np.abs(max_range)) + quantized_range = 127.0 + scale = quantized_range / real_range + assert qdata.dtype == np.int8 + assert min_val.dtype == np.float32 + assert max_val.dtype == np.float32 + assert same(min_val.asscalar(), -real_range) + assert same(max_val.asscalar(), real_range) + qdata_np = (np.sign(data_np) * np.minimum(np.abs(data_np) * scale + 0.5, quantized_range)).astype(np.int8) + assert same(qdata.asnumpy(), qdata_np) + + +@with_seed() +def test_dequantize_int8_to_float32(): + shape = rand_shape_nd(4) + qdata_np = np.random.uniform(low=-127, high=127, size=shape).astype(dtype=np.int8) + qdata = mx.nd.array(qdata_np, dtype=np.int8) + real_range = 402.3347 + min_range = mx.nd.array([-real_range], dtype=np.float32) + max_range = mx.nd.array([real_range], dtype=np.float32) + data = mx.nd.contrib.dequantize(qdata, min_range, max_range, out_type='float32') + quantized_range = 127.0 + scale = real_range / quantized_range + assert data.dtype == np.float32 + data_np = qdata_np * scale + assert_almost_equal(data.asnumpy(), data_np) + + +@with_seed() +def test_requantize_int32_to_int8(): + def quantized_int32_to_float(qdata, min_range, max_range): + assert qdata.dtype == 'int32' + quantized_range = np.iinfo('int32').max + real_range = np.maximum(np.abs(min_range), np.abs(max_range)) + scale = float(real_range) / float(quantized_range) + return qdata.astype('float32') * scale + + def float_to_quantized_int8(data, min_range, max_range): + assert data.dtype == 'float32' + real_range = np.maximum(np.abs(min_range), np.abs(max_range)) + quantized_range = np.iinfo('int8').max + scale = float(quantized_range) / float(real_range) + return (np.sign(data) * np.minimum(np.abs(data) * scale + 0.5, quantized_range)).astype('int8') + + def requantize(qdata, min_data, max_data, real_range): + data = quantized_int32_to_float(qdata, min_data, max_data) + output = float_to_quantized_int8(data, -real_range, real_range) + return output, -real_range, real_range + + def requantize_baseline(qdata, min_data, max_data, min_calib_range=None, max_calib_range=None): + if min_calib_range is not None and max_calib_range is not None: + real_range = np.maximum(np.abs(min_calib_range), np.abs(max_calib_range)) + return requantize(qdata, min_data, max_data, real_range) + else: + min_range = quantized_int32_to_float(np.min(qdata), min_data, max_data) + max_range = quantized_int32_to_float(np.max(qdata), min_data, max_data) + return requantize(qdata, min_data, max_data, np.maximum(np.abs(min_range), np.abs(max_range))) + + def check_requantize(shape, min_calib_range=None, max_calib_range=None): + qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32') + min_range = mx.nd.array([-1010.0]) + max_range = mx.nd.array([1020.0]) + if 
min_calib_range is None or max_calib_range is None: + qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range) + else: + qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range, + min_calib_range, max_calib_range) + + qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), + max_range.asscalar(), + min_calib_range=min_calib_range, + max_calib_range=max_calib_range) + assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np) + assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) + assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) + + check_requantize((3, 4, 10, 10)) + check_requantize((32, 3, 23, 23)) + check_requantize((3, 4, 10, 10), min_calib_range=-1050.0, max_calib_range=1040.0) + check_requantize((32, 3, 23, 23), min_calib_range=-134.349, max_calib_range=523.43) + + +@with_seed() +def test_quantized_conv(): + if mx.current_context().device_type != 'gpu': + print('skipped testing quantized_conv on cpu since it is not implemented yet') + return + + def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias): + with mx.Context('gpu', 0): + # run fp32 conv + data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') + conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride, + no_bias=no_bias, cudnn_off=False, name='conv2d') + arg_shapes, _, _ = conv2d.infer_shape(data=data_shape) + arg_names = conv2d.list_arguments() + conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null') + conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0, + shape=data_shape).astype('int32') + conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0, + shape=arg_shapes[1]).astype('int32') + if not no_bias: + conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0, + shape=arg_shapes[2]).astype('int32') + output = conv_exe_fp32.forward()[0] + + # run quantized conv + qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8') + qweight = mx.sym.Variable(name='qweight', dtype='int8') + min_data = mx.sym.Variable(name='min_data') + max_data = mx.sym.Variable(name='max_data') + min_weight = mx.sym.Variable(name='min_weight') + max_weight = mx.sym.Variable(name='max_weight') + quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight, min_data=min_data, + max_data=max_data, min_weight=min_weight, + max_weight=max_weight, kernel=kernel, + num_filter=num_filter, pad=pad, stride=stride, + no_bias=no_bias) + qarg_names = quantized_conv2d.list_arguments() + type_dict = None + if not no_bias: + type_dict = {qarg_names[2]: 'int8'} + conv_exe_int8 = quantized_conv2d.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null') + conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype('int8') + conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8') + quantized_range = 127.0 + if no_bias: + conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range + conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range + conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range + conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range + else: + conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8') + conv_exe_int8.arg_dict[qarg_names[3]][:] = 
-quantized_range + conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range + conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range + conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range + conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range + conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range + qoutput, min_range, max_range = conv_exe_int8.forward() + + if no_bias: + assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) + else: + # with adding bias, accuracy loss should not be greater than one + diff = mx.nd.abs(output - qoutput.astype(output.dtype)) + cond = mx.nd.lesser(2, diff).sum().asscalar() + assert cond == 0 + + check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True) + check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False) + + +@with_seed() +def test_quantized_pooling(): + if mx.current_context().device_type != 'gpu': + print('skipped testing quantized_pooling on cpu since it is not implemented yet') + return + + def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool): + with mx.Context('gpu', 0): + data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') + pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, global_pool=global_pool, cudnn_off=False) + arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape) + arg_names = pooling_fp32.list_arguments() + pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') + pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0, + shape=data_shape).astype('int32') + output = pooling_fp32_exe.forward()[0] + + qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8') + min_data = mx.sym.Variable(name='min_data') + max_data = mx.sym.Variable(name='max_data') + quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data, + max_data=max_data, kernel=kernel, + pad=pad, stride=stride, pool_type=pool_type, + global_pool=global_pool) + pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null') + qarg_names = quantized_pooling.list_arguments() + pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype('int8') + quantized_range = 127.0 + pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range + pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range + qoutput, min_range, max_range = pooling_int8_exe.forward() + + if pool_type == 'max': + assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) + elif pool_type == 'avg': # for avg pooling, fp32 and int8 may be different due to rounding errors + diff = mx.nd.abs(output - qoutput.astype(output.dtype)) + cond = mx.nd.lesser(2, diff).sum().asscalar() + assert cond == 0 + + check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False) + check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), True) + check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), False) + check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), True) + + +@with_seed() +def test_quantized_fc(): + if mx.current_context().device_type != 'gpu': + print('skipped testing quantized_fc on cpu since it is not implemented yet') + return + + def check_quantized_fc(data_shape, num_hidden, no_bias, flatten=True): + with mx.Context('gpu', 0): + data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') + fc_fp32 = 
+            fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten)
+            arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
+            arg_names = fc_fp32.list_arguments()
+            fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+            fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                         shape=data_shape).astype('int32')
+            fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                         shape=arg_shapes[1]).astype('int32')
+            if not no_bias:
+                fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                             shape=arg_shapes[2]).astype('int32')
+            output = fc_fp32_exe.forward()[0]
+
+            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
+            fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
+                                                               no_bias=no_bias, flatten=flatten)
+            qarg_names = fc_int8.list_arguments()
+            type_dict = {qarg_names[1]: 'int8'}
+            if not no_bias:
+                type_dict.update({qarg_names[2]: 'int8'})
+            fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
+            fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype('int8')
+            fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
+            quantized_range = 127.0
+            if no_bias:
+                fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
+                fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
+                fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
+                fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
+            else:
+                fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
+                fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
+                fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
+                fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
+                fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
+                fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
+                fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
+            qoutput, min_range, max_range = fc_int8_exe.forward()
+
+            if no_bias:
+                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+            else:
+                # with a bias term, allow a small accuracy loss (absolute difference of at most 2)
+                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
+                cond = mx.nd.lesser(2, diff).sum().asscalar()
+                assert cond == 0
+
+    check_quantized_fc((32, 512, 2, 2), 100, True)
+    check_quantized_fc((32, 111, 2, 2), 100, True)
+    check_quantized_fc((32, 512, 2, 2), 100, False)
+    check_quantized_fc((32, 111, 2, 2), 100, False)
+
+
+@with_seed()
+def test_quantized_flatten():
+    def check_quantized_flatten(shape):
+        qdata = mx.nd.random.uniform(low=-127, high=127, shape=shape).astype('int8')
+        min_data = mx.nd.array([-1023.343], dtype='float32')
+        max_data = mx.nd.array([2343.324275], dtype='float32')
+        qoutput, min_output, max_output = mx.nd.contrib.quantized_flatten(qdata, min_data, max_data)
+        assert qoutput.ndim == 2
+        assert qoutput.shape[0] == qdata.shape[0]
+        assert qoutput.shape[1] == np.prod(qdata.shape[1:])
+        assert same(qdata.asnumpy().flatten(), qoutput.asnumpy().flatten())
+        assert same(min_data.asnumpy(), min_output.asnumpy())
+        assert same(max_data.asnumpy(), max_output.asnumpy())
+
+    check_quantized_flatten((10,))
+    check_quantized_flatten((10, 15))
+    check_quantized_flatten((10, 15, 18))
+    check_quantized_flatten((3, 4, 23, 23))
+
+
+@with_seed()
+def test_quantize_params():
+    data = mx.sym.Variable('data')
+    conv = mx.sym.Convolution(data, kernel=(1, 1), num_filter=2048, name='conv')
+    sym = mx.sym.BatchNorm(data=conv, eps=2e-05, fix_gamma=False, momentum=0.9, use_global_stats=False, name='bn')
+    offline_params = [name for name in sym.list_arguments()
+                      if not name.startswith('data') and not name.endswith('label')]
+    params = {}
+    for name in offline_params:
+        params[name] = mx.nd.uniform(shape=(2, 2))
+    qsym = mx.contrib.quant._quantize_symbol(sym, offline_params=offline_params)
+    qparams = mx.contrib.quant._quantize_params(qsym, params)
+    param_names = params.keys()
+    qparam_names = qparams.keys()
+    for name in qparam_names:
+        if name.startswith('bn'):
+            assert name in param_names
+        elif name.startswith('conv'):
+            assert name not in param_names
+            assert name.find('quantize') != -1
+
+
+def get_fp32_sym():
+    data = mx.sym.Variable('data')
+    conv = mx.sym.Convolution(data, kernel=(1, 1), num_filter=16, name='conv')
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-05, fix_gamma=False, momentum=0.9, use_global_stats=False, name='bn')
+    act = mx.sym.Activation(data=bn, act_type='relu', name='relu')
+    pool = mx.sym.Pooling(act, kernel=(4, 4), pool_type='avg', name='pool')
+    fc = mx.sym.FullyConnected(pool, num_hidden=10, flatten=True, name='fc')
+    sym = mx.sym.SoftmaxOutput(fc, grad_scale=1, ignore_label=-1, multi_output=False,
+                               out_grad=False, preserve_shape=False, use_ignore=False, name='softmax')
+    return sym
+
+
+@with_seed()
+def test_quantize_model():
+    def check_params(params, qparams, qsym=None):
+        if qsym is None:
+            assert len(params) == len(qparams)
+            for k, v in params.items():
+                assert k in qparams
+                assert same(v.asnumpy(), qparams[k].asnumpy())
+        else:
+            qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
+            assert len(qparams) == len(qparams_ground_truth)
+            for k, v in qparams_ground_truth.items():
+                assert k in qparams
+                assert same(v.asnumpy(), qparams[k].asnumpy())
+
+    def check_qsym_calibrated(qsym):
+        attrs = qsym.attr_dict()
+        for k, v in attrs.items():
+            if k.find('requantize_') != -1:
+                assert 'min_calib_range' in v
+                assert 'max_calib_range' in v
+
+    sym = get_fp32_sym()
+    mod = Module(symbol=sym)
+    batch_size = 4
+    data_shape = (batch_size, 4, 10, 10)
+    label_shape = (batch_size, 10)
+    mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
+    mod.init_params()
+    arg_params, aux_params = mod.get_params()
+    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
+                                                                     arg_params=arg_params,
+                                                                     aux_params=aux_params,
+                                                                     ctx=mx.current_context(),
+                                                                     calib_mode='none')
+    check_params(arg_params, qarg_params, qsym)
+    check_params(aux_params, qaux_params)
+
+    calib_data = mx.nd.random.uniform(shape=data_shape)
+    calib_data = NDArrayIter(data=calib_data)
+    calib_data = DummyIter(calib_data)
+    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
+                                                                     arg_params=arg_params,
+                                                                     aux_params=aux_params,
+                                                                     ctx=mx.current_context(),
+                                                                     calib_mode='naive',
+                                                                     calib_data=calib_data,
+                                                                     num_calib_examples=20)
+    check_params(arg_params, qarg_params, qsym)
+    check_params(aux_params, qaux_params)
+    check_qsym_calibrated(qsym)
+
+
+@with_seed()
+def test_quantize_sym_with_calib():
+    sym = get_fp32_sym()
+    offline_params = [name for name in sym.list_arguments()
+                      if not name.startswith('data') and not name.endswith('label')]
+    qsym = mx.contrib.quant._quantize_symbol(sym, offline_params=offline_params)
+    requantize_op_names = ['requantize_conv', 'requantize_fc']
+    th_dict = {'conv_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0)),
+               'fc_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0))}
+    op_name_to_th_name = {'requantize_conv': 'conv_output', 'requantize_fc': 'fc_output'}
+    cqsym = mx.contrib.quant._calibrate_quantized_sym(qsym, th_dict)
+    attr_dict = cqsym.attr_dict()
+    for name in requantize_op_names:
+        assert name in attr_dict
+        lhs = float(attr_dict[name]['min_calib_range'])
+        rhs = th_dict[op_name_to_th_name[name]][0]
+        assert_almost_equal(np.array([lhs]), np.array([rhs]))
+        lhs = float(attr_dict[name]['max_calib_range'])
+        rhs = th_dict[op_name_to_th_name[name]][1]
+        assert_almost_equal(np.array([lhs]), np.array([rhs]), rtol=1e-3, atol=1e-4)
+
+
+@with_seed()
+def test_get_optimal_thresholds():
+    # Given an ndarray with elements following a uniform distribution, the optimal threshold
+    # for quantizing the ndarray should be either abs(min(nd)) or abs(max(nd)).
+    def get_threshold(nd):
+        min_nd = mx.nd.min(nd)
+        max_nd = mx.nd.max(nd)
+        return mx.nd.maximum(mx.nd.abs(min_nd), mx.nd.abs(max_nd)).asnumpy()
+
+    nd_dict = {'layer1': mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23))}
+    expected_threshold = get_threshold(nd_dict['layer1'])
+    th_dict = mx.contrib.quant._get_optimal_thresholds(nd_dict)
+    assert 'layer1' in th_dict
+    assert_almost_equal(np.array([th_dict['layer1'][1]]), expected_threshold, rtol=0.001, atol=0.001)
+
+
+if __name__ == "__main__":
+    import nose
+    nose.runmodule()
diff --git a/tests/python/quantization_gpu/test_quantization_gpu.py b/tests/python/quantization_gpu/test_quantization_gpu.py
new file mode 100644
index 000000000000..4f2d70effd49
--- /dev/null
+++ b/tests/python/quantization_gpu/test_quantization_gpu.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import sys
+import mxnet as mx
+
+
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../quantization'))
+from mxnet.test_utils import set_default_context
+from test_quantization import *
+
+set_default_context(mx.gpu(0))
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 240c06a5d7a2..2486be04a52f 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -2539,7 +2539,7 @@ def npy_layer_norm(data, gamma, beta, axis=1, eps=1E-5):
     for req in ['write', 'add']:
         check_numeric_gradient(out_s, {'data': data, 'gamma': gamma, 'beta': beta},
                                grad_nodes={'data': req, 'gamma': req, 'beta': req},
-                               numeric_eps=1e-2, rtol=1e-2, atol=1e-3)
+                               numeric_eps=1e-2, rtol=1e-2, atol=1e-2)

 def test_layer_norm():
     for dtype, forward_check_eps in zip([np.float16, np.float32, np.float64],