diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn index 52c3052df8ec..cb2cc7ac17ff 160000 --- a/3rdparty/mkldnn +++ b/3rdparty/mkldnn @@ -1 +1 @@ -Subproject commit 52c3052df8ec1d5b8b45cb6c350a952840eabd42 +Subproject commit cb2cc7ac17ff4e2ef50805c7048d33256d82be4d diff --git a/CMakeLists.txt b/CMakeLists.txt index fcbe707aa9ef..f0779abd06d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.0.2) +cmake_minimum_required(VERSION 3.13) # workaround to store CMAKE_CROSSCOMPILING because it is getting reset by the project command if(CMAKE_CROSSCOMPILING) @@ -452,6 +452,7 @@ if(USE_OPENMP) set(OPENMP_STANDALONE_BUILD TRUE) set(LIBOMP_ENABLE_SHARED TRUE) set(CMAKE_BUILD_TYPE Release) + set(OPENMP_ENABLE_LIBOMPTARGET OFF CACHE BOOL "LLVM OpenMP offloading support") # Requires CMP0077 (CMake 3.13) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp) endfunction() diff --git a/cd/python/pypi/Jenkins_pipeline.groovy b/cd/python/pypi/Jenkins_pipeline.groovy index e9f172a570fe..fa9300db3ca0 100644 --- a/cd/python/pypi/Jenkins_pipeline.groovy +++ b/cd/python/pypi/Jenkins_pipeline.groovy @@ -27,7 +27,7 @@ // This is a temporary solution until we are confident with the packages generated by CI // This should be removed in the not too distant future. // We only skip the publish step so we can still QA the other variants. -pypi_releases = ["cu92", "cu92mkl"] +pypi_releases = [] def get_pipeline(mxnet_variant) { def node_type = mxnet_variant.startsWith('cu') ? NODE_LINUX_GPU : NODE_LINUX_CPU @@ -72,6 +72,7 @@ def push(mxnet_variant) { } else { echo "Temporarily skipping publishing PyPI package for '${mxnet_variant}'." } + sh "./ci/docker/runtime_functions.sh cd_s3_publish" } } diff --git a/cd/python/pypi/pypi_publish.py b/cd/python/pypi/pypi_publish.py index 7e09f644c734..2729068dd503 100755 --- a/cd/python/pypi/pypi_publish.py +++ b/cd/python/pypi/pypi_publish.py @@ -35,10 +35,8 @@ def post_wheel(path): logging.info('Posting {} to PyPI'.format(path)) pypi_credentials = get_secret() - cmd = 'python3 -m twine upload --username {} --password {} {}'.format( - pypi_credentials['username'], - pypi_credentials['password'], - path) + cmd = 'python3 -m twine upload {}'.format(path) + version = os.path.basename(path).split('-')[1] # The PyPI credentials for DEV have username set to 'skipPublish' # This way we do not attempt to publish the PyPI package @@ -47,14 +45,15 @@ def post_wheel(path): print('In DEV account, skipping publish') print('Would have run: {}'.format(cmd)) return 0 - else: + elif any(test_version_mark in version for test_version_mark in ['a', 'b', 'dev']): print('Skipping publishing nightly builds to PyPI.') print('See https://github.com/pypa/pypi-support/issues/50 for details') return 0 - - # DO NOT PRINT CMD IN THIS BLOCK, includes password - p = subprocess.run(cmd.split(' '), - stdout=subprocess.PIPE) + else: + env = os.environ.copy() + env['TWINE_USERNAME'] = pypi_credentials['username'] + env['TWINE_PASSWORD'] = pypi_credentials['password'] + p = subprocess.run(cmd.split(' '), stdout=subprocess.PIPE, env=env) logging.info(p.stdout) return p.returncode @@ -85,7 +84,7 @@ def get_secret(): raise e else: return json.loads(get_secret_value_response['SecretString']) - - + + if __name__ == '__main__': sys.exit(post_wheel(sys.argv[1])) diff --git a/ci/build_windows.py b/ci/build_windows.py index ce77c316ab20..5839e8d793d1 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python # -*- coding: 
utf-8 -*- # Licensed to the Apache Software Foundation (ASF) under one @@ -28,7 +28,9 @@ import platform import shutil import sys +import tempfile import time +import zipfile from distutils.dir_util import copy_tree from enum import Enum from subprocess import check_call @@ -147,22 +149,33 @@ def windows_build(args): mxnet_root = get_mxnet_root() logging.info("Found MXNet root: {}".format(mxnet_root)) - with remember_cwd(): - os.chdir(path) - cmd = "\"{}\" && cmake -G \"NMake Makefiles JOM\" {} {}".format(args.vcvars, - CMAKE_FLAGS[args.flavour], - mxnet_root) - logging.info("Generating project with CMake:\n{}".format(cmd)) - check_call(cmd, shell=True) - - cmd = "\"{}\" && jom".format(args.vcvars) - logging.info("Building with jom:\n{}".format(cmd)) - - t0 = int(time.time()) - check_call(cmd, shell=True) - - logging.info("Build flavour: {} complete in directory: \"{}\"".format(args.flavour, os.path.abspath(path))) - logging.info("Build took {}".format(datetime.timedelta(seconds=int(time.time() - t0)))) + url = 'https://github.com/Kitware/CMake/releases/download/v3.16.1/cmake-3.16.1-win64-x64.zip' + with tempfile.TemporaryDirectory() as tmpdir: + cmake_file_path = download_file(url, tmpdir) + with zipfile.ZipFile(cmake_file_path, 'r') as zip_ref: + # Create $tmpdir\cmake-3.16.1-win64-x64\bin\cmake.exe + zip_ref.extractall(tmpdir) + + with remember_cwd(): + os.chdir(path) + cmd = "\"{}\" && {} -G \"NMake Makefiles JOM\" {} {}".format( + args.vcvars, + os.path.join(tmpdir, 'cmake-3.16.1-win64-x64', 'bin', 'cmake.exe'), + CMAKE_FLAGS[args.flavour], mxnet_root) + logging.info("Generating project with CMake:\n{}".format(cmd)) + check_call(cmd, shell=True) + + cmd = "\"{}\" && jom".format(args.vcvars) + logging.info("Building with jom:\n{}".format(cmd)) + + t0 = int(time.time()) + check_call(cmd, shell=True) + + logging.info( + "Build flavour: {} complete in directory: \"{}\"".format( + args.flavour, os.path.abspath(path))) + logging.info("Build took {}".format( + datetime.timedelta(seconds=int(time.time() - t0)))) windows_package(args) @@ -262,4 +275,3 @@ def main(): if __name__ == '__main__': sys.exit(main()) - diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7 index a2e98cd2efe1..2c923a015b63 100644 --- a/ci/docker/Dockerfile.build.android_armv7 +++ b/ci/docker/Dockerfile.build.android_armv7 @@ -18,7 +18,7 @@ # # Dockerfile to build MXNet for Android ARMv7 -FROM mxnetcipinned/dockcross-base:11262018 +FROM dockcross/base MAINTAINER Pedro Larroy "pllarroy@amazon.com" # The cross-compiling emulator diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8 index f7de86763457..ca62288129bb 100644 --- a/ci/docker/Dockerfile.build.android_armv8 +++ b/ci/docker/Dockerfile.build.android_armv8 @@ -18,7 +18,7 @@ # # Dockerfile to build MXNet for Android ARM64/ARMv8 -FROM mxnetcipinned/dockcross-base:11262018 +FROM dockcross/base MAINTAINER Pedro Larroy "pllarroy@amazon.com" RUN apt-get update && apt-get install -y \ @@ -82,4 +82,4 @@ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/build \ No newline at end of file +WORKDIR /work/build diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6 index 60e223b7a60f..e6a7ffe758b9 100644 --- a/ci/docker/Dockerfile.build.armv6 +++ b/ci/docker/Dockerfile.build.armv6 @@ -18,7 +18,7 @@ # # Dockerfile to build MXNet for ARMv6 -FROM mxnetcipinned/dockcross-linux-armv6:11262018 +FROM dockcross/linux-armv6 ENV ARCH armv6l ENV HOSTCC 
gcc diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7 index 0b557d5839e9..bad9ab214050 100644 --- a/ci/docker/Dockerfile.build.armv7 +++ b/ci/docker/Dockerfile.build.armv7 @@ -18,7 +18,7 @@ # # Dockerfile to build MXNet for ARMv7 -FROM mxnetcipinned/dockcross-linux-armv7:11262018 +FROM dockcross/linux-armv7 ENV ARCH armv7l ENV HOSTCC gcc diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8 index ef9c95865590..bd2373180f0b 100644 --- a/ci/docker/Dockerfile.build.armv8 +++ b/ci/docker/Dockerfile.build.armv8 @@ -18,7 +18,7 @@ # # Dockerfile to build MXNet for ARM64/ARMv8 -FROM mxnetcipinned/dockcross-linux-arm64:11262018 +FROM dockcross/linux-arm64 ENV ARCH aarch64 ENV HOSTCC gcc diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index 07097887f87d..e31ee43a93d8 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -22,7 +22,7 @@ FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder -FROM mxnetcipinned/dockcross-linux-arm64:11262018 +FROM dockcross/linux-arm64 ENV ARCH aarch64 ENV HOSTCC gcc diff --git a/ci/docker/install/requirements b/ci/docker/install/requirements index cbfc521e2c08..fd716f5fa815 100644 --- a/ci/docker/install/requirements +++ b/ci/docker/install/requirements @@ -26,8 +26,8 @@ h5py==2.8.0rc1 mock==2.0.0 nose==1.3.7 nose-timer==0.7.3 -numpy>1.16.0,<2.0.0 +numpy>1.16.0,<1.18.0 pylint==2.3.1; python_version >= '3.0' requests<2.19.0,>=2.18.4 -scipy==1.0.1 +scipy==1.2.1 six==1.11.0 diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh index 8426b539f4c4..bd5d1f6fdf6f 100755 --- a/ci/docker/install/ubuntu_core.sh +++ b/ci/docker/install/ubuntu_core.sh @@ -57,11 +57,10 @@ apt-get install -y \ ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0.1.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so -# Note: we specify an exact cmake version to work around a cmake 3.10 CUDA 10 issue. -# Reference: https://github.com/clab/dynet/issues/1457 +# CMake 3.13.2+ is required mkdir /opt/cmake && cd /opt/cmake -wget -nv https://cmake.org/files/v3.12/cmake-3.12.4-Linux-x86_64.sh -sh cmake-3.12.4-Linux-x86_64.sh --prefix=/opt/cmake --skip-license +wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh +sh cmake-3.13.5-Linux-x86_64.sh --prefix=/opt/cmake --skip-license ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake -rm cmake-3.12.4-Linux-x86_64.sh +rm cmake-3.13.5-Linux-x86_64.sh cmake --version diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index b658f953a78a..e078b2a8f89c 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -2065,6 +2065,15 @@ cd_pypi_publish() { ./cd/python/pypi/pypi_publish.py `readlink -f wheel_build/dist/*.whl` } +cd_s3_publish() { + set -ex + pip3 install --user awscli + filepath=$(readlink -f wheel_build/dist/*.whl) + filename=$(basename $filepath) + variant=$(echo $filename | cut -d'-' -f1 | cut -d'_' -f2 -s) + aws s3 cp ${filepath} s3://apache-mxnet/dist/${variant}/${filename} --grants read=uri=http://acs.amazonaws.com/groups/global/AllUsers,full=id=43f628fab72838a4f0b929d7f1993b14411f4b0294b011261bc6bd3e950a6822 +} + build_static_scala_mkl() { set -ex pushd . diff --git a/ci/util.py b/ci/util.py index 4b3a399184f9..cd5665d04df8 100644 --- a/ci/util.py +++ b/ci/util.py @@ -15,12 +15,15 @@ # specific language governing permissions and limitations # under the License. 
-import os import contextlib import logging import logging.config +import os +import subprocess import sys +import requests + def get_mxnet_root() -> str: curpath = os.path.abspath(os.path.dirname(__file__)) @@ -130,3 +133,31 @@ def config_logging(): # or sensitive information logging.getLogger("botocore").setLevel(logging.WARNING) logging.getLogger("requests").setLevel(logging.WARNING) + + +# Takes url and downloads it to the dest_path directory on Windows. +def download_file(url, dest_path): + file_name = url.split('/')[-1] + full_path = "{}\\{}".format(dest_path, file_name) + logging.info("Downloading: {}".format(full_path)) + r = requests.get(url, stream=True) + if r.status_code == 404: + return r.status_code + elif r.status_code != 200: + logging.error("{} returned status code {}".format(url, r.status_code)) + with open(full_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + return full_path + + +# Takes arguments and runs command on host. Shell is disabled by default. +def run_command(args, shell=False): + try: + logging.info("Issuing command: {}".format(args)) + res = subprocess.check_output(args, shell=shell, timeout=1800).decode("utf-8").replace("\r\n", "") + logging.info("Output: {}".format(res)) + except subprocess.CalledProcessError as e: + raise RuntimeError("command '{}' returned with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + return res diff --git a/example/quantization/README.md b/example/quantization/README.md index 8cdc1bb7e06f..b934a811f31d 100644 --- a/example/quantization/README.md +++ b/example/quantization/README.md @@ -9,7 +9,7 @@ This folder contains examples of quantizing a FP32 model with Intel® MKL-DNN or

Model Quantization with Intel® MKL-DNN

-Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer [Quantize custom models with MKL-DNN backend](https://mxnet.apache.org/tutorials/mkldnn/mkldnn_quantization.html). +Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply the quantization flow to your project directly, please refer to [Quantize custom models with MKL-DNN backend](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_quantization.html). ``` usage: imagenet_gen_qsym_mkldnn.py [-h] [--model MODEL] [--epoch EPOCH] diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py index a4c1491039b9..130282714e30 100644 --- a/example/quantization/imagenet_gen_qsym_mkldnn.py +++ b/example/quantization/imagenet_gen_qsym_mkldnn.py @@ -140,17 +140,23 @@ def save_params(fname, arg_params, aux_params, logger=None): help='If enabled, the quantize op will ' 'be calibrated offline if calibration mode is ' 'enabled') + parser.add_argument('--quiet', action='store_true', default=False, + help='suppress most of the log output') args = parser.parse_args() ctx = mx.cpu(0) - logging.basicConfig() - logger = logging.getLogger('logger') - logger.setLevel(logging.INFO) + logger = None + if not args.quiet: + logging.basicConfig() + logger = logging.getLogger('logger') + logger.setLevel(logging.INFO) - logger.info(args) - logger.info('shuffle_dataset=%s' % args.shuffle_dataset) + if logger: + logger.info(args) + logger.info('shuffle_dataset=%s' % args.shuffle_dataset) calib_mode = args.calib_mode - logger.info('calibration mode set to %s' % calib_mode) + if logger: + logger.info('calibration mode set to %s' % calib_mode) # download calibration dataset if calib_mode != 'none': @@ -158,13 +164,16 @@ def save_params(fname, arg_params, aux_params, logger=None): # download model if not args.no_pretrained: - logger.info('Get pre-trained model from MXNet or Gluoncv modelzoo.') - logger.info('If you want to use custom model, please set --no-pretrained.') + if logger: + logger.info('Get pre-trained model from MXNet or Gluoncv modelzoo.') + logger.info('If you want to use a custom model, please set --no-pretrained.') if args.model in ['imagenet1k-resnet-152', 'imagenet1k-inception-bn']: - logger.info('model %s is downloaded from MXNet modelzoo' % args.model) + if logger: + logger.info('model %s is downloaded from MXNet 
modelzoo' % args.model) prefix, epoch = download_model(model_name=args.model, logger=logger) else: - logger.info('model %s is converted from GluonCV' % args.model) + if logger: + logger.info('model %s is converted from GluonCV' % args.model) prefix = convert_from_gluon(model_name=args.model, image_shape=args.image_shape, classes=1000, logger=logger) rgb_mean = '123.68,116.779,103.939' rgb_std = '58.393, 57.12, 57.375' @@ -178,14 +187,16 @@ def save_params(fname, arg_params, aux_params, logger=None): # get batch size batch_size = args.batch_size - logger.info('batch size = %d for calibration' % batch_size) + if logger: + logger.info('batch size = %d for calibration' % batch_size) # get number of batches for calibration num_calib_batches = args.num_calib_batches - if calib_mode == 'none': - logger.info('skip calibration step as calib_mode is none') - else: - logger.info('number of batches = %d for calibration' % num_calib_batches) + if logger: + if calib_mode == 'none': + logger.info('skip calibration step as calib_mode is none') + else: + logger.info('number of batches = %d for calibration' % num_calib_batches) # get number of threads for decoding the dataset data_nthreads = args.data_nthreads @@ -195,7 +206,8 @@ def save_params(fname, arg_params, aux_params, logger=None): exclude_first_conv = args.exclude_first_conv if args.quantized_dtype == "uint8": - logger.info('quantized dtype is set to uint8, will exclude first conv.') + if logger: + logger.info('quantized dtype is set to uint8, will exclude first conv.') exclude_first_conv = True excluded_sym_names = [] if not args.no_pretrained: @@ -242,42 +254,48 @@ def save_params(fname, arg_params, aux_params, logger=None): else: raise ValueError('Currently, model %s is not supported in this script' % args.model) else: - logger.info('Please set proper RGB configs for model %s' % args.model) + if logger: + logger.info('Please set proper RGB configs for model %s' % args.model) # add rgb mean/std of your model. rgb_mean = '0,0,0' rgb_std = '0,0,0' # add layer names you do not want to quantize. 
- logger.info('Please set proper excluded_sym_names for model %s' % args.model) + if logger: + logger.info('Please set proper excluded_sym_names for model %s' % args.model) excluded_sym_names += ['layers'] if exclude_first_conv: excluded_sym_names += ['layers'] - logger.info('These layers have been excluded %s' % excluded_sym_names) + if logger: + logger.info('These layers have been excluded %s' % excluded_sym_names) label_name = args.label_name - logger.info('label_name = %s' % label_name) + if logger: + logger.info('label_name = %s' % label_name) data_shape = tuple([int(i) for i in image_shape.split(',')]) - logger.info('Input data shape = %s' % str(data_shape)) - - logger.info('rgb_mean = %s' % rgb_mean) + if logger: + logger.info('Input data shape = %s' % str(data_shape)) + logger.info('rgb_mean = %s' % rgb_mean) + logger.info('rgb_std = %s' % rgb_std) rgb_mean = [float(i) for i in rgb_mean.split(',')] mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]} - logger.info('rgb_std = %s' % rgb_std) rgb_std = [float(i) for i in rgb_std.split(',')] std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]} combine_mean_std = {} combine_mean_std.update(mean_args) combine_mean_std.update(std_args) if calib_mode == 'none': - logger.info('Quantizing FP32 model %s' % args.model) + if logger: + logger.info('Quantizing FP32 model %s' % args.model) qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params, ctx=ctx, excluded_sym_names=excluded_sym_names, calib_mode=calib_mode, quantized_dtype=args.quantized_dtype, logger=logger) sym_name = '%s-symbol.json' % (prefix + '-quantized') else: - logger.info('Creating ImageRecordIter for reading calibration dataset') + if logger: + logger.info('Creating ImageRecordIter for reading calibration dataset') data = mx.io.ImageRecordIter(path_imgrec=args.calib_dataset, label_width=1, preprocess_threads=data_nthreads, diff --git a/julia/src/MXNet.jl b/julia/src/MXNet.jl index 86d7d06806fc..4c50d7321158 100644 --- a/julia/src/MXNet.jl +++ b/julia/src/MXNet.jl @@ -80,7 +80,11 @@ export Context, cpu, gpu, num_gpus, - gpu_memory_info + gpu_memory_info, + current_context, + @context, + @cpu, + @gpu # model.jl export AbstractModel, diff --git a/julia/src/context.jl b/julia/src/context.jl index 68e69138e10e..bb80e0728724 100644 --- a/julia/src/context.jl +++ b/julia/src/context.jl @@ -31,11 +31,89 @@ struct Context Context(dev_type::CONTEXT_TYPE, dev_id::Integer = 0) = new(dev_type, dev_id) end +const _default_ctx = Ref{Context}(Context(CPU, 0)) + Context(dev_type::Integer, dev_id::Integer = 0) = Context(convert(CONTEXT_TYPE, dev_type), dev_id) Base.show(io::IO, ctx::Context) = - print(io, "$(ctx.device_type)$(ctx.device_id)") + print(io, lowercase("$(ctx.device_type)$(ctx.device_id)")) + +function _with_context(dev_type::Union{Symbol,Expr}, dev_id, e::Expr) + global _default_ctx + quote + ctx = current_context() + ctx′ = Context($(esc(dev_type)), $(esc(dev_id))) + $_default_ctx[] = ctx′ + try + return $(esc(e)) + finally + $_default_ctx[] = ctx + end + end +end + +""" + @context device_type [device_id] expr + +Change the default context in the following expression. 
+ +# Examples +```jl-repl +julia> mx.@context mx.GPU begin + mx.zeros(2, 3) + end +2×3 NDArray{Float32,2} @ gpu0: + 0.0f0 0.0f0 0.0f0 + 0.0f0 0.0f0 0.0f0 + +julia> @context mx.GPU mx.zeros(3, 2) +3×2 NDArray{Float32,2} @ gpu0: + 0.0f0 0.0f0 + 0.0f0 0.0f0 + 0.0f0 0.0f0 +``` +""" +macro context(dev_type, e::Expr) + _with_context(dev_type, 0, e) +end + +macro context(dev_type, dev_id, e::Expr) + _with_context(dev_type, dev_id, e) +end + +for dev ∈ [:cpu, :gpu] + ctx = QuoteNode(Symbol(uppercase(string(dev)))) + docstring = """ + @$dev [device_id] expr + + A shorthand for `@context mx.$(uppercase(string(dev)))`. + + # Examples + ```jl-repl + julia> mx.@$dev mx.zeros(2, 3) + 2×3 NDArray{Float32,2} @ gpu0: + 0.0f0 0.0f0 0.0f0 + 0.0f0 0.0f0 0.0f0 + ``` + """ + @eval begin + @doc $docstring -> + macro $dev(e::Expr) + ctx = $ctx + quote + @context $ctx $(esc(e)) + end + end + + macro $dev(dev_id, e::Expr) + ctx = $ctx + quote + @context $ctx $(esc(dev_id)) $(esc(e)) + end + end + end +end # for dev ∈ [:cpu, :gpu] """ cpu(dev_id) @@ -86,3 +164,27 @@ function gpu_memory_info(dev_id = 0) @mxcall :MXGetGPUMemoryInformation64 (Cint, Ref{UInt64}, Ref{UInt64}) dev_id free n free[], n[] end + +""" + current_context() + +Return the current context. + +By default, `mx.cpu()` is used for all the computations +and it can be overridden by using the `@context` macro. + +# Examples +```jl-repl +julia> mx.current_context() +cpu0 + +julia> mx.@context mx.GPU 1 begin # Context changed in the following code block + mx.current_context() + end +gpu1 + +julia> mx.current_context() +cpu0 +``` +""" +current_context() = _default_ctx[] diff --git a/julia/src/ndarray/array.jl b/julia/src/ndarray/array.jl index b71e5ddf9397..2cd9c2e24f9c 100644 --- a/julia/src/ndarray/array.jl +++ b/julia/src/ndarray/array.jl @@ -28,13 +28,14 @@ Base.similar(x::NDArray{T,N}; writable = x.writable, ctx = context(x)) where {T, NDArray{T,N}(undef, size(x)...; writable = writable, ctx = ctx) """ - zeros([DType], dims, [ctx::Context = cpu()]) + zeros([DType], dims, ctx::Context = current_context()) zeros([DType], dims...) zeros(x::NDArray) Create zero-ed `NDArray` with specific shape and type. """ -function zeros(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType} +function zeros(::Type{T}, dims::NTuple{N,Int}, + ctx::Context = current_context()) where {N,T<:DType} x = NDArray{T}(undef, dims..., ctx = ctx) x[:] = zero(T) x @@ -42,7 +43,7 @@ end zeros(::Type{T}, dims::Int...) where {T<:DType} = zeros(T, dims) -zeros(dims::NTuple{N,Int}, ctx::Context = cpu()) where N = +zeros(dims::NTuple{N,Int}, ctx::Context = current_context()) where N = zeros(MX_float, dims, ctx) zeros(dims::Int...) = zeros(dims) @@ -50,13 +51,14 @@ zeros(x::NDArray)::typeof(x) = zeros_like(x) Base.zeros(x::NDArray)::typeof(x) = zeros_like(x) """ - ones([DType], dims, [ctx::Context = cpu()]) + ones([DType], dims, ctx::Context = current_context()) ones([DType], dims...) ones(x::NDArray) Create an `NDArray` with specific shape & type, and initialize with 1. """ -function ones(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType} +function ones(::Type{T}, dims::NTuple{N,Int}, + ctx::Context = current_context()) where {N,T<:DType} arr = NDArray{T}(undef, dims..., ctx = ctx) arr[:] = one(T) arr @@ -64,7 +66,7 @@ end ones(::Type{T}, dims::Int...) where T<:DType = ones(T, dims) -ones(dims::NTuple{N,Int}, ctx::Context = cpu()) where N = +ones(dims::NTuple{N,Int}, ctx::Context = current_context()) where N = ones(MX_float, dims, ctx) ones(dims::Int...) 
= ones(dims) @@ -458,12 +460,12 @@ function Base.fill!(arr::NDArray, x) end """ - fill(x, dims, ctx=cpu()) + fill(x, dims, ctx = current_context()) fill(x, dims...) Create an `NDArray` filled with the value `x`, like `Base.fill`. """ -function fill(x::T, dims::NTuple{N,Integer}, ctx::Context = cpu()) where {T,N} +function fill(x::T, dims::NTuple{N,Integer}, ctx::Context = current_context()) where {T,N} arr = NDArray{T}(undef, dims, ctx = ctx) arr[:] = x arr diff --git a/julia/src/ndarray/type.jl b/julia/src/ndarray/type.jl index 8d90d63f0360..e24c89291dcb 100644 --- a/julia/src/ndarray/type.jl +++ b/julia/src/ndarray/type.jl @@ -116,7 +116,7 @@ end # UndefInitializer constructors NDArray{T,N}(::UndefInitializer, dims::NTuple{N,Integer}; - writable = true, ctx::Context = cpu()) where {T,N} = + writable = true, ctx::Context = current_context()) where {T,N} = NDArray{T,N}(_ndarray_alloc(T, dims, ctx, false), writable) NDArray{T,N}(::UndefInitializer, dims::Vararg{Integer,N}; kw...) where {T,N} = NDArray{T,N}(undef, dims; kw...) diff --git a/julia/test/unittest/context.jl b/julia/test/unittest/context.jl index 0a8f086a194a..e903f9212930 100644 --- a/julia/test/unittest/context.jl +++ b/julia/test/unittest/context.jl @@ -26,8 +26,85 @@ function test_num_gpus() @test num_gpus() >= 0 end +function test_context_macro() + @info "Context::@context" + + @context mx.CPU 42 begin + ctx = mx.current_context() + @test ctx.device_type == mx.CPU + @test ctx.device_id == 42 + + @context mx.GPU 24 begin + ctx = mx.current_context() + @test ctx.device_type == mx.GPU + @test ctx.device_id == 24 + end + + ctx = mx.current_context() + @test ctx.device_type == mx.CPU + @test ctx.device_id == 42 + end + + function f() + ctx = mx.current_context() + @test ctx.device_type == mx.GPU + @test ctx.device_id == 123 + end + + @context mx.GPU 123 begin + f() + end + + @context mx.GPU begin + ctx = mx.current_context() + @test ctx.device_type == mx.GPU + @test ctx.device_id == 0 + end + + @context mx.CPU begin + ctx = mx.current_context() + @test ctx.device_type == mx.CPU + @test ctx.device_id == 0 + end + + @info "Context::@gpu" + @gpu 123 f() + @gpu begin + ctx = mx.current_context() + @test ctx.device_type == mx.GPU + @test ctx.device_id == 0 + end + let n = 321 + @gpu n begin + ctx = mx.current_context() + @test ctx.device_type == mx.GPU + @test ctx.device_id == 321 + end + end + + @info "Context::@cpu" + @cpu 123 begin + ctx = mx.current_context() + @test ctx.device_type == mx.CPU + @test ctx.device_id == 123 + end + @cpu begin + ctx = mx.current_context() + @test ctx.device_type == mx.CPU + @test ctx.device_id == 0 + end + let n = 321 + @cpu n begin + ctx = mx.current_context() + @test ctx.device_type == mx.CPU + @test ctx.device_id == 321 + end + end +end + @testset "Context Test" begin test_num_gpus() + test_context_macro() end diff --git a/julia/test/unittest/ndarray.jl b/julia/test/unittest/ndarray.jl index 599b0a65bfc4..fb59b71edd60 100644 --- a/julia/test/unittest/ndarray.jl +++ b/julia/test/unittest/ndarray.jl @@ -1294,7 +1294,7 @@ function test_show() @test occursin("1×4", str) @test occursin("NDArray", str) @test occursin("Int64", str) - @test occursin("CPU", str) + @test occursin("cpu", str) @test match(r"1\s+2\s+3\s+4", str) != nothing end diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index b0714037bb42..01051ab7c8e4 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -188,8 +188,8 @@ def collect(self, name, arr): return 
handle = ctypes.cast(arr, NDArrayHandle) arr = NDArray(handle, writable=False).copyto(cpu()).asnumpy() - if self.logger is not None: - self.logger.info("Collecting layer %s histogram of shape %s" % (name, arr.shape)) + if self.logger: + self.logger.debug("Collecting layer %s histogram of shape %s" % (name, arr.shape)) min_range = np.min(arr) max_range = np.max(arr) th = max(abs(min_range), abs(max_range)) @@ -224,9 +224,9 @@ def collect(self, name, arr): max(cur_min_max[1], max_range)) else: self.min_max_dict[name] = (min_range, max_range) - if self.logger is not None: - self.logger.info("Collecting layer %s min_range=%f, max_range=%f" - % (name, min_range, max_range)) + if self.logger: + self.logger.debug("Collecting layer %s min_range=%f, max_range=%f" + % (name, min_range, max_range)) def _calibrate_quantized_sym(qsym, th_dict): """Given a dictionary containing the thresholds for quantizing the layers, @@ -358,18 +358,19 @@ def _get_optimal_thresholds(hist_dict, quantized_dtype, num_quantized_bins=255, else: th_dict[name] = (-th, th) del hist_dict[name] # release the memory - if logger is not None: - logger.info('layer=%s, min_val=%f, max_val=%f, th=%f, divergence=%f' - % (name, min_val, max_val, th, divergence)) + if logger: + logger.debug('layer=%s, min_val=%f, max_val=%f, th=%f, divergence=%f' - % (name, min_val, max_val, th, divergence)) return th_dict -def _load_sym(sym, logger=logging): +def _load_sym(sym, logger=None): """Given a str as a path to the symbol .json file or a symbol, returns a Symbol object.""" if isinstance(sym, str): # sym is a symbol file path cur_path = os.path.dirname(os.path.realpath(__file__)) symbol_file_path = os.path.join(cur_path, sym) - logger.info('Loading symbol from file %s' % symbol_file_path) + if logger: + logger.info('Loading symbol from file %s' % symbol_file_path) return sym_load(symbol_file_path) elif isinstance(sym, Symbol): return sym @@ -378,14 +379,15 @@ def _load_sym(sym, logger=logging): ' while received type %s' % str(type(sym))) -def _load_params(params, logger=logging): +def _load_params(params, logger=None): """Given a str as a path to the .params file or a pair of params, returns two dictionaries representing arg_params and aux_params. """ if isinstance(params, str): cur_path = os.path.dirname(os.path.realpath(__file__)) param_file_path = os.path.join(cur_path, params) - logger.info('Loading params from file %s' % param_file_path) + if logger: + logger.info('Loading params from file %s' % param_file_path) save_dict = nd_load(param_file_path) arg_params = {} aux_params = {} @@ -451,7 +453,7 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',), label_names=('softmax_label',), ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', calib_data=None, num_calib_examples=None, - quantized_dtype='int8', quantize_mode='smart', logger=logging): + quantized_dtype='int8', quantize_mode='smart', logger=None): """User-level API for generating a quantized model from a FP32 model w/ or w/o calibration. The backend quantized operators are only enabled for Linux systems. Please do not run inference using the quantized models on Windows for now. 
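The recurring edit above replaces `quantization.py`'s old `logger=logging` module default with `logger=None` plus an `if logger:` guard, so the library stays silent unless the caller opts in. A minimal standalone sketch of that pattern (the function and names below are illustrative, not part of this diff):

```python
import logging

def calibrate(samples, logger=None):
    # Defaulting to None (rather than the logging module itself) makes
    # logging strictly opt-in: nothing is emitted unless a logger is passed.
    if logger:
        logger.info('calibrating on %d samples', len(samples))
    # Same symmetric-threshold rule as the histogram collector above.
    return max(abs(min(samples)), abs(max(samples)))

logging.basicConfig(level=logging.INFO)
calibrate([-0.5, 2.0, 1.5], logger=logging.getLogger('calib'))  # logs
calibrate([-0.5, 2.0, 1.5])                                     # silent
```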
@@ -530,7 +532,9 @@ def quantize_model(sym, arg_params, aux_params, ' the names of the operators that will not be quantized,' ' while received type %s' % str(type(excluded_op_names))) - logger.info('Quantizing symbol') + if logger: + os.environ['MXNET_QUANTIZATION_VERBOSE'] = '1' + logger.info('Quantizing symbol') if quantized_dtype not in ('int8', 'uint8', 'auto'): raise ValueError('unknown quantized_dtype %s received,' ' expected `int8`, `uint8` or `auto`' % quantized_dtype) @@ -561,21 +565,24 @@ def quantize_model(sym, arg_params, aux_params, include_layer=calib_layer, max_num_examples=num_calib_examples, logger=logger) - logger.info('Collected layer outputs from FP32 model using %d examples' % num_examples) - logger.info('Calculating optimal thresholds for quantization') + if logger: + logger.info('Collected layer outputs from FP32 model using %d examples' % num_examples) + logger.info('Calculating optimal thresholds for quantization') th_dict = _get_optimal_thresholds(hist_dict, quantized_dtype, logger=logger) elif calib_mode == 'naive': th_dict, num_examples = _collect_layer_output_min_max( mod, calib_data, quantized_dtype, include_layer=calib_layer, max_num_examples=num_calib_examples, logger=logger) - logger.info('Collected layer output min/max values from FP32 model using %d examples' - % num_examples) + if logger: + logger.info('Collected layer output min/max values from FP32 model using %d examples' + % num_examples) else: raise ValueError('unknown calibration mode %s received,' ' expected `none`, `naive`, or `entropy`' % calib_mode) qsym = _calibrate_quantized_sym(qsym, th_dict) - logger.info('Quantizing parameters') + if logger: + logger.info('Quantizing parameters') qarg_params = _quantize_params(qsym, arg_params, th_dict) return qsym, qarg_params, aux_params @@ -584,7 +591,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params, data_names=('data',), label_names=('softmax_label',), ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', calib_data=None, num_calib_examples=None, - quantized_dtype='int8', quantize_mode='smart', logger=logging): + quantized_dtype='int8', quantize_mode='smart', logger=None): """User-level API for generating a fusion + quantized model from a FP32 model w/ or w/o calibration with Intel MKL-DNN. The backend quantized operators are only enabled for Linux systems. Please do not run @@ -621,7 +628,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params, def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', - quantized_dtype='int8', quantize_mode='full', logger=logging): + quantized_dtype='int8', quantize_mode='full', logger=None): """User-level API for generating a quantized model from a FP32 model w/o calibration and a collector for naive or entropy calibration. The backend quantized operators are only enabled for Linux systems. 
Please do not run @@ -676,7 +683,9 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), ' while received type %s' % str(type(excluded_sym_names))) if not isinstance(ctx, Context): raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) - logger.info('Quantizing graph') + if logger: + os.environ['MXNET_QUANTIZATION_VERBOSE'] = '1' + logger.info('Quantizing graph') if quantized_dtype not in ('int8', 'uint8', 'auto'): raise ValueError('unknown quantized_dtype %s received,' ' expected `int8`, `uint8` or `auto`' % quantized_dtype) @@ -693,20 +702,24 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), if calib_mode == 'entropy': collector = _LayerHistogramCollector( include_layer=calib_layer, logger=logger) - logger.info( - 'Create a layer output collector for entropy calibration.') + if logger: + logger.info( + 'Create a layer output collector for entropy calibration.') elif calib_mode == 'naive': collector = _LayerOutputMinMaxCollector(quantized_dtype=quantized_dtype, include_layer=calib_layer, logger=logger) - logger.info( - 'Create a layer output minmax collector for naive calibration') + if logger: + logger.info( + 'Create a layer output minmax collector for naive calibration') else: raise ValueError('unknown calibration mode %s received,' ' expected `none`, `naive`, or `entropy`' % calib_mode) - logger.info('Collector created, please use set_monitor_callback' - ' to collect calibration information.') + if logger: + logger.info('Collector created, please use set_monitor_callback' + ' to collect calibration information.') - logger.info('Quantizing parameters') + if logger: + logger.info('Quantizing parameters') qarg_params = _quantize_params(qsym, arg_params, th_dict) return qsym, qarg_params, aux_params, collector @@ -751,7 +764,8 @@ def calib_graph(qsym, arg_params, aux_params, collector, th_dict = {} if calib_mode is not None and calib_mode != 'none': if calib_mode == 'entropy': - logger.info('Calculating optimal thresholds for quantization') + if logger: + logger.info('Calculating optimal thresholds for quantization') th_dict = _get_optimal_thresholds( collector.hist_dict, quantized_dtype, logger=logger) elif calib_mode == 'naive': @@ -763,7 +777,8 @@ def calib_graph(qsym, arg_params, aux_params, collector, else: raise ValueError('please set calibration mode to naive or entropy.') - logger.info('Quantizing parameters') + if logger: + logger.info('Quantizing parameters') qarg_params = _quantize_params(qsym, arg_params, th_dict) return qsym, qarg_params, aux_params @@ -771,7 +786,7 @@ def calib_graph(qsym, arg_params, aux_params, collector, def quantize_net(network, quantized_dtype='auto', quantize_mode='full', exclude_layers=None, exclude_layers_match=None, exclude_operators=None, calib_data=None, data_shapes=None, calib_mode='none', - num_calib_examples=None, ctx=cpu(), logger=logging): + num_calib_examples=None, ctx=cpu(), logger=None): """User-level API for Gluon users to generate a quantized SymbolBlock from a FP32 HybridBlock w/ or w/o calibration. The backend quantized operators are only enabled for Linux systems. Please do not run inference using the quantized models on Windows for now. 
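For context on what the `naive` calibration branch above computes: the min/max collector records each layer's output extrema over the calibration batches, and the threshold is the larger absolute bound, taken symmetrically. A toy NumPy sketch of that rule (standalone, not the MXNet API itself):

```python
import numpy as np

def naive_threshold(layer_output):
    # Mirrors the min/max collector logic: track the extreme values of a
    # layer's output, then take a symmetric range for int8 quantization.
    min_range = float(np.min(layer_output))
    max_range = float(np.max(layer_output))
    th = max(abs(min_range), abs(max_range))
    return -th, th

lo, hi = naive_threshold(np.random.randn(64, 128))
scale = 127.0 / hi  # quantized value = clip(round(x * scale), -127, 127)
print(lo, hi, scale)
```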
@@ -825,7 +840,8 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full', ------- """ - logger.info('Export HybridBlock') + if logger: + logger.info('Export HybridBlock') network.hybridize() import mxnet as mx if calib_data is not None: @@ -881,7 +897,8 @@ def __exit__(self, exc_type, exc_value, traceback): for layers in list(symnet.get_internals()): if layers.name.find(name_match) != -1: exclude_layers.append(layers.name) - logger.info('These layers have been excluded %s' % exclude_layers) + if logger: + logger.info('These layers have been excluded %s' % exclude_layers) if ctx == mx.cpu(): symnet = symnet.get_backend_symbol('MKLDNN_QUANTIZE') @@ -906,8 +923,9 @@ def __exit__(self, exc_type, exc_value, traceback): mod.set_params(args, auxs, allow_missing=False, force_init=True) num_examples = _collect_layer_statistics(mod, calib_data, collector, num_calib_examples, logger) - logger.info('Collected layer output values from FP32 model using %d examples' - % num_examples) + if logger: + logger.info('Collected layer output values from FP32 model using %d examples' + % num_examples) qsym, qarg_params, aux_params = calib_graph( qsym=qsym, arg_params=args, aux_params=auxs, collector=collector, calib_mode=calib_mode, quantized_dtype=quantized_dtype, logger=logger) diff --git a/python/mxnet/gluon/contrib/estimator/batch_processor.py b/python/mxnet/gluon/contrib/estimator/batch_processor.py index 4985f8c81bf3..aa5adbfdea5f 100644 --- a/python/mxnet/gluon/contrib/estimator/batch_processor.py +++ b/python/mxnet/gluon/contrib/estimator/batch_processor.py @@ -61,8 +61,8 @@ def evaluate_batch(self, estimator, Batch axis to split the validation data into devices. """ data, label = self._get_data_and_label(val_batch, estimator.context, batch_axis) - pred = [estimator.eval_net(x) for x in data] - loss = [estimator.evaluation_loss(y_hat, y) for y_hat, y in zip(pred, label)] + pred = [estimator.val_net(x) for x in data] + loss = [estimator.val_loss(y_hat, y) for y_hat, y in zip(pred, label)] return data, label, pred, loss diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py index 09f43151e235..ed8a53d7c3a6 100644 --- a/python/mxnet/gluon/contrib/estimator/estimator.py +++ b/python/mxnet/gluon/contrib/estimator/estimator.py @@ -61,22 +61,19 @@ class Estimator(object): Trainer to apply optimizer on network parameters. context : Context or list of Context Device(s) to run the training on. - evaluation_loss : gluon.loss.loss - Loss (objective) function to calculate during validation. If set evaluation_loss - None, it will use the same loss function as self.loss - eval_net : gluon.Block + val_net : gluon.Block The model used for validation. The validation model does not necessarily belong to the same model class as the training model. But the two models typically share the same architecture. Therefore the validation model can reuse parameters of the training model. - The code example of consruction of eval_net sharing the same network parameters as + The code example of consruction of val_net sharing the same network parameters as the training net is given below: >>> net = _get_train_network() - >>> eval_net = _get_test_network(params=net.collect_params()) + >>> val_net = _get_test_network(params=net.collect_params()) >>> net.initialize(ctx=ctx) - >>> est = Estimator(net, loss, eval_net=eval_net) + >>> est = Estimator(net, loss, val_net=val_net) Proper namespace match is required for weight sharing between two networks. 
Most networks inheriting :py:class:`Block` can share their parameters correctly. An exception is @@ -84,6 +81,9 @@ class Estimator(object): the naming in mxnet Gluon API, please refer to the site (https://mxnet.apache.org/api/python/docs/tutorials/packages/gluon/blocks/naming.html) for future information. + val_loss : gluon.loss.loss + Loss (objective) function to calculate during validation. If set val_loss + None, it will use the same loss function as self.loss batch_processor: BatchProcessor BatchProcessor provides customized fit_batch() and evaluate_batch() methods """ @@ -113,8 +113,8 @@ def __init__(self, net, initializer=None, trainer=None, context=None, - evaluation_loss=None, - eval_net=None, + val_net=None, + val_loss=None, batch_processor=None): self.net = net self.loss = self._check_loss(loss) @@ -122,12 +122,12 @@ def __init__(self, net, self._val_metrics = _check_metrics(val_metrics) self._add_default_training_metrics() self._add_validation_metrics() - self.evaluation_loss = self.loss - if evaluation_loss is not None: - self.evaluation_loss = self._check_loss(evaluation_loss) - self.eval_net = self.net - if eval_net is not None: - self.eval_net = eval_net + self.val_loss = self.loss + if val_loss is not None: + self.val_loss = self._check_loss(val_loss) + self.val_net = self.net + if val_net is not None: + self.val_net = val_net self.logger = logging.Logger(name='Estimator', level=logging.INFO) self.logger.addHandler(logging.StreamHandler(sys.stdout)) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index c111a95a707a..e380b4937168 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -28,8 +28,8 @@ from . import _internal as _npi from ..ndarray import NDArray -__all__ = ['shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like', - 'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', +__all__ = ['shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like', 'invert', + 'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'bitwise_not', 'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'sqrt', 'cbrt', 'abs', 'absolute', 'exp', 'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log', 'degrees', 'log2', 'log1p', 'rint', 'radians', 'reciprocal', 'square', 'negative', 'fix', 'ceil', 'floor', 'histogram', @@ -37,10 +37,10 @@ 'logspace', 'expand_dims', 'tile', 'arange', 'array_split', 'split', 'vsplit', 'concatenate', 'append', 'stack', 'vstack', 'column_stack', 'dstack', 'average', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'hanning', 'hamming', - 'blackman', 'flip', 'around', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', - 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', - 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero', 'shares_memory', - 'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where'] + 'blackman', 'flip', 'around', 'round', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', + 'unique', 'lcm', 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', + 'greater', 'less', 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero', + 'shares_memory', 'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount'] @set_module('mxnet.ndarray.numpy') @@ 
-2690,6 +2690,112 @@ def floor(x, out=None, **kwargs): return _unary_func_helper(x, _npi.floor, _np.floor, out=out, **kwargs) +@set_module('mxnet.ndarray.numpy') +@wrap_np_unary_func +def bitwise_not(x, out=None, **kwargs): + r""" + Compute bit-wise inversion, or bit-wise NOT, element-wise. + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``~``. + + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + + Returns + ------- + out : ndarray or scalar + Result. + This is a scalar if `x` is a scalar. + + See Also + -------- + bitwise_and, bitwise_or, bitwise_xor + logical_not + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + We've seen that 13 is represented by ``00001101``. + The invert or bit-wise NOT of 13 is then: + + >>> x = np.invert(np.array(13, dtype=np.uint8)) + >>> x + 242 + >>> np.binary_repr(x, width=8) + '11110010' + + Notes + ----- + `bitwise_not` is an alias for `invert`: + + >>> np.bitwise_not is np.invert + True + """ + return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +@wrap_np_unary_func +def invert(x, out=None, **kwargs): + r""" + Compute bit-wise inversion, or bit-wise NOT, element-wise. + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``~``. + + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + + Returns + ------- + out : ndarray or scalar + Result. + This is a scalar if `x` is a scalar. + + See Also + -------- + bitwise_and, bitwise_or, bitwise_xor + logical_not + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + We've seen that 13 is represented by ``00001101``. + The invert or bit-wise NOT of 13 is then: + + >>> x = np.invert(np.array(13, dtype=np.uint8)) + >>> x + 242 + >>> np.binary_repr(x, width=8) + '11110010' + + Notes + ----- + `bitwise_not` is an alias for `invert`: + + >>> np.bitwise_not is np.invert + True + """ + return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs) + + @set_module('mxnet.ndarray.numpy') @wrap_np_unary_func def trunc(x, out=None, **kwargs): @@ -4631,6 +4737,25 @@ def around(x, decimals=0, out=None, **kwargs): raise TypeError('type {} not supported'.format(str(type(x)))) +@set_module('mxnet.ndarray.numpy') +def round(x, decimals=0, out=None, **kwargs): + r""" + round_(a, decimals=0, out=None) + Round an array to the given number of decimals. + + See Also + -------- + around : equivalent function; see for details. 
+ """ + from ...numpy import ndarray + if isinstance(x, numeric_types): + return _np.around(x, decimals, **kwargs) + elif isinstance(x, ndarray): + return _npi.around(x, decimals, out=out, **kwargs) + else: + raise TypeError('type {} not supported'.format(str(type(x)))) + + @set_module('mxnet.ndarray.numpy') @wrap_np_binary_func def arctan2(x1, x2, out=None, **kwargs): @@ -5929,3 +6054,60 @@ def where(condition, x=None, y=None): return nonzero(condition) else: return _npi.where(condition, x, y, out=None) + + +@set_module('mxnet.ndarray.numpy') +def bincount(x, weights=None, minlength=0): + """ + Count number of occurrences of each value in array of non-negative ints. + + Parameters + ---------- + x : ndarray + input array, 1 dimension, nonnegative ints. + weights: ndarray + input weigths same shape as x. (Optional) + minlength: int + A minimum number of bins for the output. (Optional) + + Returns + -------- + out : ndarray + the result of binning the input array. The length of out is equal to amax(x)+1. + + Raises + -------- + Value Error + If the input is not 1-dimensional, or contains elements with negative values, + or if minlength is negative + TypeError + If the type of the input is float or complex. + + Examples + -------- + >>> np.bincount(np.arange(5)) + array([1, 1, 1, 1, 1]) + >>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7])) + array([1, 3, 1, 1, 0, 0, 0, 1]) + + >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23]) + >>> np.bincount(x).size == np.amax(x)+1 + True + + >>> np.bincount(np.arange(5, dtype=float)) + Traceback (most recent call last): + File "", line 1, in + TypeError: array cannot be safely cast to required type + + >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights + >>> x = np.array([0, 1, 1, 2, 2, 2]) + >>> np.bincount(x, weights=w) + array([ 0.3, 0.7, 1.1]) + """ + if not isinstance(x, NDArray): + raise TypeError("Input data should be NDarray") + if minlength < 0: + raise ValueError("Minlength value should greater than 0") + if weights is None: + return _npi.bincount(x, minlength=minlength, has_weights=False) + return _npi.bincount(x, weights=weights, minlength=minlength, has_weights=True) diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 9d1a6f9119ee..e67c766c6bdf 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -23,7 +23,7 @@ from ..ndarray import NDArray -__all__ = ['randint', 'uniform', 'normal', "choice", "rand", "multinomial"] +__all__ = ['randint', 'uniform', 'normal', "choice", "rand", "multinomial", "shuffle"] def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): @@ -344,3 +344,39 @@ def rand(*size, **kwargs): for s in size: output_shape += (s,) return uniform(0, 1, size=output_shape, **kwargs) + + +def shuffle(x): + """ + Modify a sequence in-place by shuffling its contents. + + This function only shuffles the array along the first axis of a + multi-dimensional array. The order of sub-arrays is changed but + their contents remain the same. + + Parameters + ---------- + x: ndarray + The array or list to be shuffled. 
+ + Returns + ------- + None + + Examples + -------- + >>> arr = np.arange(10) + >>> np.random.shuffle(arr) + >>> arr + array([5., 1., 0., 6., 7., 3., 9., 8., 4., 2.]) # random + + Multi-dimensional arrays are only shuffled along the first axis: + + >>> arr = np.arange(9).reshape((3, 3)) + >>> np.random.shuffle(arr) + >>> arr + array([[6., 7., 8.], # random + [3., 4., 5.], + [0., 1., 2.]]) + """ + _npi.shuffle(x, out=x) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 5795c62942df..22094a1621d2 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -47,19 +47,19 @@ from ..ndarray.ndarray import _storage_type __all__ = ['ndarray', 'empty', 'array', 'shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like', - 'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', - 'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', + 'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'bitwise_not', + 'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'invert', 'sqrt', 'cbrt', 'abs', 'absolute', 'exp', 'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log', 'degrees', 'log2', 'log1p', 'rint', 'radians', 'reciprocal', 'square', 'negative', 'histogram', 'fix', 'ceil', 'floor', 'trunc', 'logical_not', 'arcsinh', 'arccosh', 'arctanh', 'append', 'argsort', 'tensordot', 'eye', 'linspace', 'logspace', 'expand_dims', 'tile', 'arange', 'array_split', 'split', 'vsplit', 'concatenate', 'stack', 'vstack', 'column_stack', 'dstack', 'average', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', - 'ravel', 'unravel_index', 'hanning', 'hamming', 'blackman', 'flip', 'around', 'arctan2', 'hypot', - 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity', 'take', - 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', 'greater_equal', + 'ravel', 'unravel_index', 'hanning', 'hamming', 'blackman', 'flip', 'around', 'round', 'arctan2', + 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity', + 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero', 'shares_memory', - 'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where'] + 'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount'] # Return code for dispatching indexing function call _NDARRAY_UNSUPPORTED_INDEXING = -1 @@ -1377,7 +1377,7 @@ def argsort(self, axis=-1, kind=None, order=None): # pylint: disable=arguments- The arguments are the same as for :py:func:`argsort`, with this array as data. """ - raise argsort(self, axis=axis, kind=kind, order=order) + return argsort(self, axis=axis, kind=kind, order=order) def argmax_channel(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax_channel`. @@ -1558,13 +1558,13 @@ def norm(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute norm') - def round(self, *args, **kwargs): + def round(self, decimals=0, out=None, **kwargs): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`round`. The arguments are the same as for :py:func:`round`, with this array as data. """ - raise NotImplementedError + return round(self, decimals=decimals, out=out, **kwargs) def rint(self, *args, **kwargs): """Convenience fluent method for :py:func:`rint`. 
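Two of the method fixes above are easy to miss: `ndarray.argsort` previously `raise`d its own result instead of returning it, and `ndarray.round` raised `NotImplementedError` outright; both now delegate to the module-level functions. A hedged usage sketch (assuming a build that ships the `mxnet.numpy` interface changed in this diff):

```python
from mxnet import np  # numpy-compatible front end extended above

a = np.array([3.14159, 2.71828])
print(np.round(a, 2))  # new module-level alias for np.around
print(a.round(2))      # fluent method now returns instead of raising

idx = np.array([0.3, 0.1, 0.2]).argsort()  # previously raised its result
print(idx)
```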
@@ -3981,6 +3981,110 @@ def floor(x, out=None, **kwargs): """ return _mx_nd_np.floor(x, out=out, **kwargs) +@set_module('mxnet.numpy') +@wrap_np_unary_func +def invert(x, out=None, **kwargs): + r""" + Compute bit-wise inversion, or bit-wise NOT, element-wise. + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``~``. + + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + + Returns + ------- + out : ndarray or scalar + Result. + This is a scalar if `x` is a scalar. + + See Also + -------- + bitwise_and, bitwise_or, bitwise_xor + logical_not + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + We've seen that 13 is represented by ``00001101``. + The invert or bit-wise NOT of 13 is then: + + >>> x = np.invert(np.array(13, dtype=np.uint8)) + >>> x + 242 + >>> np.binary_repr(x, width=8) + '11110010' + + Notes + ----- + `bitwise_not` is an alias for `invert`: + + >>> np.bitwise_not is np.invert + True + """ + return _mx_nd_np.bitwise_not(x, out=out, **kwargs) + +@set_module('mxnet.numpy') +@wrap_np_unary_func +def bitwise_not(x, out=None, **kwargs): + r""" + Compute bit-wise inversion, or bit-wise NOT, element-wise. + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``~``. + + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + + Returns + ------- + out : ndarray or scalar + Result. + This is a scalar if `x` is a scalar. + + See Also + -------- + bitwise_and, bitwise_or, bitwise_xor + logical_not + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + We've seen that 13 is represented by ``00001101``. + The invert or bit-wise NOT of 13 is then: + + >>> x = np.invert(np.array(13, dtype=np.uint8)) + >>> x + 242 + >>> np.binary_repr(x, width=8) + '11110010' + + Notes + ----- + `bitwise_not` is an alias for `invert`: + + >>> np.bitwise_not is np.invert + True + """ + return _mx_nd_np.bitwise_not(x, out=out, **kwargs) + @set_module('mxnet.numpy') @wrap_np_unary_func @@ -6352,6 +6456,19 @@ def around(x, decimals=0, out=None, **kwargs): return _mx_nd_np.around(x, decimals, out=out, **kwargs) +@set_module('mxnet.numpy') +def round(x, decimals=0, out=None, **kwargs): + r""" + round_(a, decimals=0, out=None) + Round an array to the given number of decimals. + + See Also + -------- + around : equivalent function; see for details. 
+ """ + return _mx_nd_np.around(x, decimals, out=out, **kwargs) + + @set_module('mxnet.numpy') @wrap_np_binary_func def arctan2(x1, x2, out=None, **kwargs): @@ -7892,3 +8009,54 @@ def where(condition, x=None, y=None): [ 0., 3., -1.]]) """ return _mx_nd_np.where(condition, x, y) + + +@set_module('mxnet.numpy') +def bincount(x, weights=None, minlength=0): + """ + Count number of occurrences of each value in array of non-negative ints. + + Parameters + ---------- + x : ndarray + input array, 1 dimension, nonnegative ints. + weights: ndarray + input weigths same shape as x. (Optional) + minlength: int + A minimum number of bins for the output. (Optional) + + Returns + -------- + out : ndarray + the result of binning the input array. The length of out is equal to amax(x)+1. + + Raises + -------- + Value Error + If the input is not 1-dimensional, or contains elements with negative values, + or if minlength is negative + TypeError + If the type of the input is float or complex. + + Examples + -------- + >>> np.bincount(np.arange(5)) + array([1, 1, 1, 1, 1]) + >>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7])) + array([1, 3, 1, 1, 0, 0, 0, 1]) + + >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23]) + >>> np.bincount(x).size == np.amax(x)+1 + True + + >>> np.bincount(np.arange(5, dtype=float)) + Traceback (most recent call last): + File "", line 1, in + TypeError: array cannot be safely cast to required type + + >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights + >>> x = np.array([0, 1, 1, 2, 2, 2]) + >>> np.bincount(x, weights=w) + array([ 0.3, 0.7, 1.1]) + """ + return _mx_nd_np.bincount(x, weights=weights, minlength=minlength) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index 1cad4a55c466..95719a005cec 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -20,7 +20,7 @@ from __future__ import absolute_import from ..ndarray import numpy as _mx_nd_np -__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial"] +__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial", "shuffle", "randn"] def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): @@ -321,3 +321,80 @@ def rand(*size, **kwargs): for s in size: output_shape += (s,) return _mx_nd_np.random.uniform(0, 1, size=output_shape, **kwargs) + + +def shuffle(x): + """ + Modify a sequence in-place by shuffling its contents. + + This function only shuffles the array along the first axis of a + multi-dimensional array. The order of sub-arrays is changed but + their contents remain the same. + + Parameters + ---------- + x: ndarray + The array or list to be shuffled. + + Returns + ------- + None + + Examples + -------- + >>> arr = np.arange(10) + >>> np.random.shuffle(arr) + >>> arr + array([5., 1., 0., 6., 7., 3., 9., 8., 4., 2.]) # random + + Multi-dimensional arrays are only shuffled along the first axis: + + >>> arr = np.arange(9).reshape((3, 3)) + >>> np.random.shuffle(arr) + >>> arr + array([[6., 7., 8.], # random + [3., 4., 5.], + [0., 1., 2.]]) + """ + _mx_nd_np.random.shuffle(x) + + +def randn(*size, **kwargs): + r"""Return a sample (or samples) from the "standard normal" distribution. + If positive, int_like or int-convertible arguments are provided, + `randn` generates an array of shape ``(d0, d1, ..., dn)``, filled + with random floats sampled from a univariate "normal" (Gaussian) + distribution of mean 0 and variance 1 (if any of the :math:`d_i` are + floats, they are first converted to integers by truncation). 
A single + float randomly sampled from the distribution is returned if no + argument is provided. + This is a convenience function. If you want an interface that takes a + tuple as the first argument, use `numpy.random.standard_normal` instead. + Parameters + ---------- + d0, d1, ..., dn : int, optional + The dimensions of the returned array, should be all positive. + If no argument is given a single Python float is returned. + Returns + ------- + Z : ndarray + A ``(d0, d1, ..., dn)``-shaped array of floating-point samples from + the standard normal distribution, or a single such float if + no parameters were supplied. + Notes + ----- + For random samples from :math:`N(\mu, \sigma^2)`, use: + ``sigma * np.random.randn(...) + mu`` + Examples + -------- + >>> np.random.randn() + 2.1923875335537315 #random + Two-by-four array of samples from N(3, 6.25): + >>> 2.5 * np.random.randn(2, 4) + 3 + array([[-4.49401501, 4.00950034, -1.81814867, 7.29718677], #random + [ 0.39924804, 4.68456316, 4.99394529, 4.84057254]]) #random + """ + output_shape = () + for s in size: + output_shape += (s,) + return _mx_nd_np.random.normal(0, 1, size=output_shape, **kwargs) diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py index e93720564774..9aa755fb436e 100644 --- a/python/mxnet/numpy_dispatch_protocol.py +++ b/python/mxnet/numpy_dispatch_protocol.py @@ -86,6 +86,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs): 'argmin', 'argmax', 'around', + 'round', 'argsort', 'append', 'broadcast_arrays', @@ -147,6 +148,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs): 'resize', 'where', 'full_like', + 'bincount' ] @@ -218,6 +220,8 @@ def _register_array_function(): 'square', 'cbrt', 'reciprocal', + 'invert', + 'bitwise_not', 'remainder', 'sin', 'cos', diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index c61d5b2d393d..0b341b804758 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -36,7 +36,7 @@ except ImportError: from builtins import slice as py_slice -__all__ = ['zeros', 'zeros_like', 'ones', 'ones_like', 'full_like', +__all__ = ['zeros', 'zeros_like', 'ones', 'ones_like', 'full_like', 'bitwise_not', 'invert', 'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'sqrt', 'cbrt', 'abs', 'absolute', 'exp', 'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log', 'degrees', 'log2', 'log1p', @@ -45,10 +45,10 @@ 'logspace', 'expand_dims', 'tile', 'arange', 'array_split', 'split', 'vsplit', 'concatenate', 'append', 'stack', 'vstack', 'column_stack', 'dstack', 'average', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'hanning', 'hamming', - 'blackman', 'flip', 'around', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', - 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', - 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'shares_memory', - 'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where'] + 'blackman', 'flip', 'around', 'round', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', + 'unique', 'lcm', 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', + 'greater', 'less', 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', + 'shares_memory', 
'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount'] @set_module('mxnet.symbol.numpy') @@ -491,7 +491,7 @@ def argsort(self, axis=-1, kind=None, order=None): # pylint: disable=arguments- The arguments are the same as for :py:func:`argsort`, with this array as data. """ - raise argsort(self, axis=axis, kind=kind, order=order) + return argsort(self, axis=axis, kind=kind, order=order) def argmax_channel(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax_channel`. @@ -665,13 +665,13 @@ def norm(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute norm') - def round(self, *args, **kwargs): + def round(self, decimals=0, out=None, **kwargs): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`round`. The arguments are the same as for :py:func:`round`, with this array as data. """ - raise NotImplementedError + return round(self, decimals=decimals, out=out, **kwargs) def rint(self, *args, **kwargs): """Convenience fluent method for :py:func:`rint`. @@ -1040,6 +1040,98 @@ def ones(shape, dtype=_np.float32, order='C', ctx=None): return _npi.ones(shape=shape, ctx=ctx, dtype=dtype) +@set_module('mxnet.symbol.numpy') +@wrap_np_unary_func +def invert(x, out=None, **kwargs): + r""" + Compute bit-wise inversion, or bit-wise NOT, element-wise. + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``~``. + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + Returns + ------- + out : ndarray or scalar + Result. + This is a scalar if `x` is a scalar. + See Also + -------- + bitwise_and, bitwise_or, bitwise_xor + logical_not + binary_repr : + Return the binary representation of the input number as a string. + Examples + -------- + We've seen that 13 is represented by ``00001101``. + The invert or bit-wise NOT of 13 is then: + >>> x = np.invert(np.array(13, dtype=np.uint8)) + >>> x + 242 + >>> np.binary_repr(x, width=8) + '11110010' + Notes + ----- + `bitwise_not` is an alias for `invert`: + >>> np.bitwise_not is np.invert + True + """ + return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +@wrap_np_unary_func +def bitwise_not(x, out=None, **kwargs): + r""" + Compute bit-wise inversion, or bit-wise NOT, element-wise. + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``~``. + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : ndarray, None, or tuple of ndarray and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + Returns + ------- + out : ndarray or scalar + Result. + This is a scalar if `x` is a scalar. 
+    See Also
+    --------
+    bitwise_and, bitwise_or, bitwise_xor
+    logical_not
+    binary_repr :
+        Return the binary representation of the input number as a string.
+    Examples
+    --------
+    We've seen that 13 is represented by ``00001101``.
+    The invert or bit-wise NOT of 13 is then:
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
+    >>> np.binary_repr(x, width=8)
+    '11110010'
+    Notes
+    -----
+    `bitwise_not` is an alias for `invert`:
+    >>> np.bitwise_not is np.invert
+    True
+    """
+    return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs)
+
+
 @set_module('mxnet.symbol.numpy')
 def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None):  # pylint: disable=too-many-arguments
     """
@@ -4432,6 +4524,24 @@ def around(x, decimals=0, out=None, **kwargs):
         raise TypeError('type {} not supported'.format(str(type(x))))
 
 
+@set_module('mxnet.symbol.numpy')
+def round(x, decimals=0, out=None, **kwargs):
+    r"""
+    round_(a, decimals=0, out=None)
+    Round an array to the given number of decimals.
+
+    See Also
+    --------
+    around : equivalent function; see for details.
+    """
+    if isinstance(x, numeric_types):
+        return _np.around(x, decimals, **kwargs)
+    elif isinstance(x, _Symbol):
+        return _npi.around(x, decimals, out=out, **kwargs)
+    else:
+        raise TypeError('type {} not supported'.format(str(type(x))))
+
+
 @set_module('mxnet.symbol.numpy')
 @wrap_np_binary_func
 def arctan2(x1, x2, out=None, **kwargs):
@@ -5468,4 +5578,38 @@ def load_json(json_str):
     return _Symbol(handle)
 
 
+@set_module('mxnet.symbol.numpy')
+def bincount(x, weights=None, minlength=0):
+    """
+    Count number of occurrences of each value in array of non-negative ints.
+
+    Parameters
+    ----------
+    x : _Symbol
+        Input data.
+    weights: _Symbol
+        Input weights, same shape as x. (Optional)
+    minlength: int
+        A minimum number of bins for the output. (Optional)
+
+    Returns
+    -------
+    out : _Symbol
+        The result of binning the input data.
+        The length of out is max(amax(x)+1, minlength).
+
+    Raises
+    ------
+    ValueError
+        If the input is not 1-dimensional, or contains elements with negative values,
+        or if minlength is negative.
+    TypeError
+        If the type of the input is float or complex.
+    """
+    if minlength < 0:
+        raise ValueError("minlength must be non-negative")
+    if weights is None:
+        return _npi.bincount(x, minlength=minlength, has_weights=False)
+    return _npi.bincount(x, weights=weights, minlength=minlength, has_weights=True)
+
+
 _set_np_symbol_class(_Symbol)
diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py
index 48bccb64a2b4..94c29f407acc 100644
--- a/python/mxnet/symbol/numpy/random.py
+++ b/python/mxnet/symbol/numpy/random.py
@@ -21,7 +21,7 @@ from ...context import current_context
 from . import _internal as _npi
 
-__all__ = ['randint', 'uniform', 'normal', 'rand']
+__all__ = ['randint', 'uniform', 'normal', 'rand', 'shuffle']
 
 
 def randint(low, high=None, size=None, dtype=None, ctx=None, out=None):
@@ -288,3 +288,39 @@ def choice(a, size=None, replace=True, p=None, ctx=None, out=None):
         return _npi.choice(a=a, size=size, replace=replace, ctx=ctx, weighted=False, out=out)
     else:
         return _npi.choice(p, a=a, size=size, replace=replace, ctx=ctx, weighted=True, out=out)
+
+
+def shuffle(x):
+    """
+    Modify a sequence in-place by shuffling its contents.
+
+    This function only shuffles the array along the first axis of a
+    multi-dimensional array. The order of sub-arrays is changed but
+    their contents remain the same.
+ + Parameters + ---------- + x: _Symbol + The array or list to be shuffled. + + Returns + ------- + None + + Examples + -------- + >>> arr = np.arange(10) + >>> np.random.shuffle(arr) + >>> arr + array([5., 1., 0., 6., 7., 3., 9., 8., 4., 2.]) # random + + Multi-dimensional arrays are only shuffled along the first axis: + + >>> arr = np.arange(9).reshape((3, 3)) + >>> np.random.shuffle(arr) + >>> arr + array([[6., 7., 8.], # random + [3., 4., 5.], + [0., 1., 2.]]) + """ + _npi.shuffle(x, out=x) diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 5b26a4523c13..ea1dcf39b8ba 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -102,8 +102,7 @@ class BidirectionalGraph { std::vector> get_subsets(FCompatible is_compatible) { std::vector> subgraphs; std::unordered_set incomp_set; - std::unordered_set all_set(nodes.size()); - std::vector separation_sets; + std::vector> separation_sets; // Check each node for compatibility // and, if it is incompatible, mark nodes // on each side of it as not possible to be @@ -111,48 +110,79 @@ class BidirectionalGraph { for (Node& node : nodes) { if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); - std::unordered_set in_graph; - std::unordered_set out_graph; - std::vector dummy_head; - dummy_head.emplace_back(&node); - DFS(dummy_head, false, [&out_graph, &is_compatible](Node* node) { - if (is_compatible(node->nnvmptr)) - out_graph.insert(node); - }); - DFS(dummy_head, true, [&in_graph, is_compatible](Node* node) { - if (is_compatible(node->nnvmptr)) - in_graph.insert(node); - }); - if (!(in_graph.empty() || out_graph.empty())) - separation_sets.push_back(std::make_pair(in_graph, out_graph)); } - all_set.emplace(&node); } - IncompMap incomp_map; - std::unordered_set comp_set; - comp_set.insert(all_set.begin(), all_set.end()); - for (Node* n : incomp_set) { - comp_set.erase(n); + for (Node& node : nodes) { + if (incomp_set.count(&node) != 0) { + // Check if all your inputs are incompatible too. 
+      // If so, then its separation set does not matter,
+      // because it will be covered by the sets of its inputs.
+      bool inside_node = true;
+      for (Node* input : node.inputs) {
+        if (incomp_set.count(input) == 0) {
+          inside_node = false;
+        }
+      }
+      if (!inside_node) {
+        std::unordered_set<Node*> in_graph;
+        std::unordered_set<Node*> out_graph;
+        std::vector<Node*> dummy_head;
+        dummy_head.emplace_back(&node);
+        DFS(dummy_head, false, [&out_graph](Node* node) {
+          out_graph.insert(node);
+        });
+        DFS(dummy_head, true, [&in_graph](Node* node) {
+          in_graph.insert(node);
+        });
+        separation_sets.push_back(std::make_pair(true,
+                                                 std::make_pair(in_graph, out_graph)));
+      } else {
+        separation_sets.push_back(std::make_pair(false, PairSet()));
+      }
+    } else {
+      separation_sets.push_back(std::make_pair(false, PairSet()));
+    }
+  }
+  IncompMap incomp_map;
   // For each node construct the map of nodes that cannot be in
   // the same subset
-  for (Node* n : comp_set) {
-    for (PairSet p : separation_sets) {
-      if (p.first.count(n)) {
-        incomp_map[n].insert(p.second.begin(), p.second.end());
-      } else if (p.second.count(n)) {
-        incomp_map[n].insert(p.first.begin(), p.first.end());
+  index_t num_nodes = nodes.size();
+  for (index_t i = 0; i < num_nodes; ++i) {
+    const auto n = &(nodes[i]);
+    if (incomp_set.count(n) == 0) {
+      // Walk the separation sets of the nodes after and before i; whenever n
+      // sits on one side of a set, the whole opposite side is incompatible.
+      for (index_t j = i + 1; j < num_nodes; ++j) {
+        const auto& sep_set_pair = separation_sets[j];
+        if (sep_set_pair.first && incomp_map[n].count(&nodes[j]) == 0) {
+          const auto& p = sep_set_pair.second;
+          if (p.first.count(n)) {
+            incomp_map[n].insert(p.second.begin(), p.second.end());
+          } else if (p.second.count(n)) {
+            incomp_map[n].insert(p.first.begin(), p.first.end());
+          }
+        }
+      }
+      for (index_t j = i - 1; j >= 0; --j) {
+        const auto& sep_set_pair = separation_sets[j];
+        if (sep_set_pair.first && incomp_map[n].count(&nodes[j]) == 0) {
+          const auto& p = sep_set_pair.second;
+          if (p.first.count(n)) {
+            incomp_map[n].insert(p.second.begin(), p.second.end());
+          } else if (p.second.count(n)) {
+            incomp_map[n].insert(p.first.begin(), p.first.end());
+          }
+        }
+      }
+      for (Node* incomp_n : incomp_set) {
+        incomp_map[n].erase(incomp_n);
       }
-    }
-    for (Node* incomp_n : incomp_set) {
-      incomp_map[n].erase(incomp_n);
     }
   }
   std::unordered_set<Node*> unused_set;
-  unused_set.reserve(comp_set.size());
-  for (auto& n : comp_set) {
-    unused_set.insert(n);
+  for (auto& n : nodes) {
+    if (incomp_set.count(&n) == 0) {
+      unused_set.insert(&n);
+    }
   }
   std::unordered_set<Node*> visited;
   std::deque<Node*> stack(outputs.begin(), outputs.end());
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index ec5a79a2e675..1edd9897ec82 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -1032,17 +1032,19 @@ OpStatePtr CachedOp::Forward(
   CHECK_EQ(inputs.size(), num_inputs());
 
   Context default_ctx = inputs[0]->ctx();
-  auto state_ptr = GetCachedOpState(default_ctx);
-  auto& state = state_ptr.get_state<CachedOpState>();
+  {
+    auto state_ptr = GetCachedOpState(default_ctx);
+    auto& state = state_ptr.get_state<CachedOpState>();
 
-  const auto& idx = state.info.fwd_graph.indexed_graph();
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    CHECK_EQ(inputs[i]->ctx(), default_ctx)
-      << "CachedOp requires all inputs to live on the same context. 
But " - << idx[idx.input_nodes()[0]].source->attrs.name - << " is on " << default_ctx << " while " - << idx[idx.input_nodes()[i]].source->attrs.name - << " is on " << inputs[i]->ctx(); + const auto& idx = state.info.fwd_graph.indexed_graph(); + for (size_t i = 0; i < inputs.size(); ++i) { + CHECK_EQ(inputs[i]->ctx(), default_ctx) + << "CachedOp requires all inputs to live on the same context. But " + << idx[idx.input_nodes()[0]].source->attrs.name + << " is on " << default_ctx << " while " + << idx[idx.input_nodes()[i]].source->attrs.name + << " is on " << inputs[i]->ctx(); + } } int prev_bulk_size = Engine::Get()->set_bulk_size(config_.forward_bulk_size); diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index f92d544ddd90..f59f6da4450d 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -359,6 +359,18 @@ MXNET_UNARY_MATH_OP(negation, -a); MXNET_UNARY_MATH_OP(reciprocal, 1.0f / math::id(a)); +struct bitwise_not : public mxnet_op::tunable { + template::value, int>::type = 0> + MSHADOW_XINLINE static DType Map(DType a) { + return ~static_cast(a); + } + + MSHADOW_XINLINE static bool Map(bool a) { + return !a; + } +}; + MXNET_UNARY_MATH_OP(reciprocal_grad, -1.0f / math::sqr(a)); MXNET_UNARY_MATH_OP(sigmoid, 1.0f / (1.0f + math::exp(-a))); diff --git a/src/operator/numpy/np_bincount_op-inl.h b/src/operator/numpy/np_bincount_op-inl.h new file mode 100644 index 000000000000..254ea8fdec22 --- /dev/null +++ b/src/operator/numpy/np_bincount_op-inl.h @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors
+ * \file np_bincount_op-inl.h
+ * \brief numpy compatible bincount operator
+ */
+#ifndef MXNET_OPERATOR_NUMPY_NP_BINCOUNT_OP_INL_H_
+#define MXNET_OPERATOR_NUMPY_NP_BINCOUNT_OP_INL_H_
+
+#include
+#include
+#include
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "../operator_common.h"
+#include "../elemwise_op_common.h"
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+struct NumpyBincountParam : public dmlc::Parameter<NumpyBincountParam> {
+  int minlength;
+  bool has_weights;
+  DMLC_DECLARE_PARAMETER(NumpyBincountParam) {
+    DMLC_DECLARE_FIELD(minlength)
+    .set_default(0)
+    .describe("A minimum number of bins for the output array. "
+              "If minlength is specified, there will be at least this "
+              "number of bins in the output array.");
+    DMLC_DECLARE_FIELD(has_weights)
+    .set_default(false)
+    .describe("Determine whether Bincount has weights.");
+  }
+};
+
+inline bool NumpyBincountType(const nnvm::NodeAttrs& attrs,
+                              std::vector<int> *in_attrs,
+                              std::vector<int> *out_attrs) {
+  const NumpyBincountParam& param = nnvm::get<NumpyBincountParam>(attrs.parsed);
+  if (!param.has_weights) {
+    return ElemwiseType<1, 1>(attrs, in_attrs, out_attrs) && in_attrs->at(0) != -1;
+  } else {
+    CHECK_EQ(out_attrs->size(), 1U);
+    CHECK_EQ(in_attrs->size(), 2U);
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(1));
+    TYPE_ASSIGN_CHECK(*in_attrs, 1, out_attrs->at(0));
+    return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
+  }
+}
+
+inline bool NumpyBincountStorageType(const nnvm::NodeAttrs& attrs,
+                                     const int dev_mask,
+                                     DispatchMode* dispatch_mode,
+                                     std::vector<int> *in_attrs,
+                                     std::vector<int> *out_attrs) {
+  const NumpyBincountParam& param = nnvm::get<NumpyBincountParam>(attrs.parsed);
+  if (param.has_weights) {
+    CHECK_EQ(in_attrs->size(), 2U);
+  } else {
+    CHECK_EQ(in_attrs->size(), 1U);
+  }
+  CHECK_EQ(out_attrs->size(), 1U);
+  for (int &attr : *in_attrs) {
+    CHECK_EQ(attr, kDefaultStorage) << "Only default storage is supported";
+  }
+  for (int &attr : *out_attrs) {
+    attr = kDefaultStorage;
+  }
+  *dispatch_mode = DispatchMode::kFComputeEx;
+  return true;
+}
+
+template<typename xpu>
+void NumpyBincountForwardImpl(const OpContext &ctx,
+                              const NDArray &data,
+                              const NDArray &weights,
+                              const NDArray &out,
+                              const size_t &data_n,
+                              const int &minlength);
+
+template<typename xpu>
+void NumpyBincountForwardImpl(const OpContext &ctx,
+                              const NDArray &data,
+                              const NDArray &out,
+                              const size_t &data_n,
+                              const int &minlength);
+
+template<typename xpu>
+void NumpyBincountForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext &ctx,
+                          const std::vector<NDArray> &inputs,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<NDArray> &outputs) {
+  CHECK_GE(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK(req[0] == kWriteTo);
+  const NumpyBincountParam& param = nnvm::get<NumpyBincountParam>(attrs.parsed);
+  const bool has_weights = param.has_weights;
+  const int minlength = param.minlength;
+  const NDArray &data = inputs[0];
+  const NDArray &out = outputs[0];
+  CHECK_LE(data.shape().ndim(), 1U) << "Input only accepts 1-d arrays";
+  CHECK(!common::is_float(data.dtype())) << "Input data should be of integer type";
+  size_t N = data.shape().Size();
+  if (N == 0) {
+    mshadow::Stream<xpu> *stream = ctx.get_stream<xpu>();
+    mxnet::TShape s(1, minlength);
+    const_cast<NDArray &>(out).Init(s);
+    MSHADOW_TYPE_SWITCH(out.dtype(), OType, {
+      mxnet_op::Kernel<set_zero, xpu>::Launch(
+        stream, minlength, out.data().dptr<OType>());
+    });
+  } else {
+    if (has_weights) {
+      CHECK_EQ(inputs.size(), 2U);
+      const NDArray &weights = inputs[1];
+      CHECK_EQ(data.shape(), weights.shape()) << "weights should have the same size as data";
+      NumpyBincountForwardImpl<xpu>(ctx, 
data, weights, out, N, minlength); + } else { + NumpyBincountForwardImpl(ctx, data, out, N, minlength); + } + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_BINCOUNT_OP_INL_H_ diff --git a/src/operator/numpy/np_bincount_op.cc b/src/operator/numpy/np_bincount_op.cc new file mode 100644 index 000000000000..6256db176977 --- /dev/null +++ b/src/operator/numpy/np_bincount_op.cc @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_bicount_op.cc + * \brief numpy compatible bincount operator CPU registration + */ + +#include "./np_bincount_op-inl.h" + +namespace mxnet { +namespace op { + +void BinNumberCount(const NDArray& data, const int& minlength, + const NDArray& out, const size_t& N) { + int bin = minlength; + MSHADOW_TYPE_SWITCH(data.dtype(), DType, { + DType* data_ptr = data.data().dptr(); + for (size_t i = 0; i < N; i++) { + CHECK_GE(data_ptr[i], 0) << "input should be nonnegative number"; + if (data_ptr[i] + 1 > bin) { + bin = data_ptr[i] + 1; + } + } + }); // bin number = max(max(data) + 1, minlength) + mxnet::TShape s(1, bin); + const_cast(out).Init(s); // set the output shape forcefully +} + +template +void BincountCpuWeights(const DType* data, const OType* weights, + OType* out, const size_t& data_n) { + for (size_t i = 0; i < data_n; i++) { + int target = data[i]; + out[target] += weights[i]; + } +} + +template +void BincountCpu(const DType* data, OType* out, const size_t& data_n) { + for (size_t i = 0; i < data_n; i++) { + int target = data[i]; + out[target] += 1; + } +} + +template<> +void NumpyBincountForwardImpl(const OpContext &ctx, + const NDArray &data, + const NDArray &weights, + const NDArray &out, + const size_t &data_n, + const int &minlength) { + using namespace mxnet_op; + BinNumberCount(data, minlength, out, data_n); + mshadow::Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(data.dtype(), DType, { + MSHADOW_TYPE_SWITCH(weights.dtype(), OType, { + size_t out_size = out.shape()[0]; + Kernel::Launch(s, out_size, out.data().dptr()); + BincountCpuWeights(data.data().dptr(), weights.data().dptr(), + out.data().dptr(), data_n); + }); + }); +} + +template<> +void NumpyBincountForwardImpl(const OpContext &ctx, + const NDArray &data, + const NDArray &out, + const size_t &data_n, + const int &minlength) { + using namespace mxnet_op; + BinNumberCount(data, minlength, out, data_n); + mshadow::Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(data.dtype(), DType, { + MSHADOW_TYPE_SWITCH(out.dtype(), OType, { + size_t out_size = out.shape()[0]; + Kernel::Launch(s, out_size, out.data().dptr()); + BincountCpu(data.data().dptr(), out.data().dptr(), data_n); + }); + }); +} + 
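+// Note on the CPU path above (descriptive summary): bincount runs in two
+// passes. BinNumberCount first validates that every element is non-negative
+// and sizes the output as max(max(data) + 1, minlength); the shape has to be
+// set at run time because it depends on the data values themselves. The
+// second pass zero-fills the output, then accumulates out[data[i]] += weights[i]
+// (or += 1 when no weights are given) in one sequential sweep.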
+DMLC_REGISTER_PARAMETER(NumpyBincountParam); + +NNVM_REGISTER_OP(_npi_bincount) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + const NumpyBincountParam& params = + nnvm::get(attrs.parsed); + return params.has_weights? 2 : 1; + }) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const NumpyBincountParam& params = + nnvm::get(attrs.parsed); + return params.has_weights ? + std::vector{"data", "weights"} : + std::vector{"data"}; + }) +.set_attr("FResourceRequest", +[](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FInferType", NumpyBincountType) +.set_attr("FInferStorageType", NumpyBincountStorageType) +.set_attr("FComputeEx", NumpyBincountForward) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("data", "NDArray-or-Symbol", "Data") +.add_argument("weights", "NDArray-or-Symbol", "Weights") +.add_arguments(NumpyBincountParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_bincount_op.cu b/src/operator/numpy/np_bincount_op.cu new file mode 100644 index 000000000000..ed1f90f00c16 --- /dev/null +++ b/src/operator/numpy/np_bincount_op.cu @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_bicount_op.cu + * \brief numpy compatible bincount operator GPU registration + */ + +#include "./np_bincount_op-inl.h" +#include +#include +#include "../tensor/util/tensor_util-inl.cuh" +#include "../tensor/util/tensor_util-inl.h" + +namespace mxnet { +namespace op { + +struct BincountFusedKernel { + template + static MSHADOW_XINLINE void Map(int i, const DType* data, OType* out) { + int idx = data[i]; + atomicAdd(&out[idx], 1); + } + + template + static MSHADOW_XINLINE void Map(int i, const DType* data, const OType* weights, + OType* out) { + int idx = data[i]; + atomicAdd(&out[idx], weights[i]); + } +}; + +struct is_valid_check { + template + MSHADOW_XINLINE static void Map(int i, char* invalid_ptr, const DType* data) { + if (data[i] < 0) *invalid_ptr = 1; + } +}; + +template +bool CheckInvalidInput(mshadow::Stream *s, const DType *data, const size_t& data_size, + char* is_valid_ptr) { + using namespace mxnet_op; + int32_t is_valid = 0; + Kernel::Launch(s, 1, is_valid_ptr); + Kernel::Launch(s, data_size, is_valid_ptr, data); + CUDA_CALL(cudaMemcpyAsync(&is_valid, is_valid_ptr, sizeof(char), + cudaMemcpyDeviceToHost, mshadow::Stream::GetStream(s))); + CUDA_CALL(cudaStreamSynchronize(mshadow::Stream::GetStream(s))); + return is_valid == 0; +} + +template<> +void NumpyBincountForwardImpl(const OpContext &ctx, + const NDArray &data, + const NDArray &weights, + const NDArray &out, + const size_t &data_n, + const int &minlength) { + using namespace mxnet_op; + mshadow::Stream *s = ctx.get_stream(); + + MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, { + DType* h_ptr; + DType* d_ptr; + int bin = minlength; + d_ptr = data.data().dptr(); + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(1), s); + char* is_valid_ptr = reinterpret_cast(workspace.dptr_); + bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr); + CHECK(is_valid) << "Input should be nonnegative number"; // check invalid input + + h_ptr = reinterpret_cast(malloc(data_n*sizeof(DType))); + CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost, + mshadow::Stream::GetStream(s))); + CUDA_CALL(cudaStreamSynchronize(mshadow::Stream::GetStream(s))); + for (size_t i = 0; i < data_n; i++) { + if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1; + } + free(h_ptr); + mxnet::TShape s(1, bin); + const_cast(out).Init(s); // set the output shape forcefully + }); + + MSHADOW_TYPE_SWITCH(data.dtype(), DType, { + MSHADOW_TYPE_SWITCH(weights.dtype(), OType, { + size_t out_size = out.shape().Size(); + Kernel::Launch(s, out_size, out.data().dptr()); + Kernel::Launch( + s, data_n, data.data().dptr(), weights.data().dptr(), + out.data().dptr()); + }); + }); +} + +template<> +void NumpyBincountForwardImpl(const OpContext &ctx, + const NDArray &data, + const NDArray &out, + const size_t &data_n, + const int &minlength) { + using namespace mxnet_op; + mshadow::Stream *s = ctx.get_stream(); + + MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, { + DType* h_ptr; + DType* d_ptr; + int bin = minlength; + d_ptr = data.data().dptr(); + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(1), s); + char* is_valid_ptr = reinterpret_cast(workspace.dptr_); + bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr); + CHECK(is_valid) << "Input should be nonnegative number"; // check invalid input + + h_ptr = reinterpret_cast(malloc(data_n*sizeof(DType))); + CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost, + 
mshadow::Stream::GetStream(s))); + CUDA_CALL(cudaStreamSynchronize(mshadow::Stream::GetStream(s))); + for (size_t i = 0; i < data_n; i++) { + if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1; + } + free(h_ptr); + mxnet::TShape s(1, bin); + const_cast(out).Init(s); // set the output shape forcefully + }); + + MSHADOW_TYPE_SWITCH(data.dtype(), DType, { + MSHADOW_TYPE_SWITCH(out.dtype(), OType, { + size_t out_size = out.shape().Size(); + Kernel::Launch(s, out_size, out.data().dptr()); + Kernel::Launch( + s, data_n, data.data().dptr(), out.data().dptr()); + }); + }); +} + +NNVM_REGISTER_OP(_npi_bincount) +.set_attr("FComputeEx", NumpyBincountForward); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index cad736aab65b..5e15d7ad4e67 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -169,6 +169,20 @@ Example:: )code" ADD_FILELINE) .set_attr("FGradient", MakeZeroGradNodes); +// bitwise_not +NNVM_REGISTER_OP(_npi_bitwise_not) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"x"}; +}) +.set_attr("FCompute", UnaryOp::ComputeInt) +.add_argument("x", "NDArray-or-Symbol", "The input array.") +.set_attr("FGradient", MakeZeroGradNodes); + // trunc MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_trunc, "x", mshadow_op::trunc) .describe(R"code(Return the truncated value of the input, element-wise. diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index af8834f01664..517ef9c2b52a 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -53,6 +53,9 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_ceil, mshadow_op::ceil); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_floor, mshadow_op::floor); +NNVM_REGISTER_OP(_npi_bitwise_not) +.set_attr("FCompute", UnaryOp::ComputeInt); + MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_trunc, mshadow_op::trunc); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_fix, mshadow_op::fix); diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index db898f8840f0..7ca594d21e59 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -311,6 +311,7 @@ IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::radians); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::radians_grad); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::nt); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::np_logical_not); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::bitwise_not); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::nt); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::clip); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::clip); // NOLINT() diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc index 182f6339308a..01365067ce93 100644 --- a/src/operator/quantization/quantize_graph_pass.cc +++ b/src/operator/quantization/quantize_graph_pass.cc @@ -275,12 +275,15 @@ Graph QuantizeGraph(Graph &&src) { std::unordered_map mirror_map; std::unordered_map reverse_mirror_map; nnvm::NodeEntryMap mirror_entry_map; + static int verbose = 
dmlc::GetEnv("MXNET_QUANTIZATION_VERBOSE", 0); DFSVisit(src.outputs, [&](const NodePtr& node) { NodePtr new_node = Node::Create(); // If the currently visited node needs quantization, insert a quantize op node before the // current node and replace the current node with the quantized version in the new graph. if (quantized_node_map.count(node)) { - std::cout << node->attrs.name << " is quantized." << std::endl; + if (verbose) { + LOG(INFO) << node->attrs.name << " is quantized."; + } new_node = quantized_node_map[node]; // add data into quantized op input @@ -395,7 +398,8 @@ Graph QuantizeGraph(Graph &&src) { // (e.g., a quantized_conv2d node), and insert a dequantize op node in the new graph if there // are any. Otherwise, simply add a copy of the current node's entry to the inputs of // the new_node. - if (!node->is_variable()) std::cout << node->attrs.name << " is NOT quantized." << std::endl; + if (verbose && !node->is_variable()) + LOG(INFO) << node->attrs.name << " is NOT quantized."; *new_node = *node; new_node->inputs.clear(); for (const auto& e : node->inputs) { @@ -516,15 +520,20 @@ static inline void SetCalibTableForEntry( out_data_name = out_data_name.substr(prefix.size()); } const auto calib_table_iter = calib_table.find(out_data_name); + static int verbose = dmlc::GetEnv("MXNET_QUANTIZATION_VERBOSE", 0); if (calib_table_iter != calib_table.end()) { - std::cout << "Set calibration result to " << node->attrs.name - << " : min=" << calib_table_iter->second.first - << " max=" << calib_table_iter->second.second << std::endl; + if (verbose) { + LOG(INFO) << "Set calibration result to " << node->attrs.name + << " : min=" << calib_table_iter->second.first + << " max=" << calib_table_iter->second.second; + } node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first); node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second); if (node->op() && node->op()->attr_parser) node->op()->attr_parser(&(node->attrs)); } else { - std::cout << "Can't find calibration result for " << node->attrs.name << std::endl; + if (verbose) { + LOG(INFO) << "Can't find calibration result for " << node->attrs.name; + } } } @@ -535,7 +544,10 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) { Op::GetAttr("FNeedCalibrateInput"); static const auto& need_calib_output_map = Op::GetAttr("FNeedCalibrateOutput"); - std::cout << "Set calibration result to quantized symbol." << std::endl; + static int verbose = dmlc::GetEnv("MXNET_QUANTIZATION_VERBOSE", 0); + if (verbose) { + LOG(INFO) << "Set calibration result to quantized symbol."; + } DFSVisit(g.outputs, [&](const NodePtr& node) { if (need_calib_input_map.count(node->op())) { const auto calib_idx = need_calib_input_map[node->op()](node->attrs); diff --git a/src/operator/quantization/quantized_elemwise_mul-inl.h b/src/operator/quantization/quantized_elemwise_mul-inl.h new file mode 100644 index 000000000000..f58db8a45eea --- /dev/null +++ b/src/operator/quantization/quantized_elemwise_mul-inl.h @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file quantized_elemwise_mul-inl.h
+ * \brief CPU Implementation of basic elementwise binary mul operators
+ */
+#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
+
+#include "../tensor/elemwise_binary_op-inl.h"
+
+namespace mxnet {
+namespace op {
+/* This structure is used for requantization only when fused. */
+struct QuantizeElemwiseMulParam : public dmlc::Parameter<QuantizeElemwiseMulParam> {
+  dmlc::optional<float> min_calib_range;
+  dmlc::optional<float> max_calib_range;
+  bool enable_float_output;
+  DMLC_DECLARE_PARAMETER(QuantizeElemwiseMulParam) {
+    DMLC_DECLARE_FIELD(min_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The minimum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to requantize the "
+              "int8 output data.");
+    DMLC_DECLARE_FIELD(max_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The maximum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to requantize the "
+              "int8 output data.");
+    DMLC_DECLARE_FIELD(enable_float_output).set_default(false)
+    .describe("Whether to enable float32 output");
+  }
+};
+
+namespace quantized_elemwise_mul {
+enum QuantizedElemwiseMulOpInputs {kLhs, kRhs, kLhsMin, kLhsMax, kRhsMin, kRhsMax};
+enum QuantizedElemwiseMulOpOutputs {kOut, kOutMin, kOutMax};
+enum QuantizedElemwiseMulOpResource {kTempSpace};
+}  // namespace quantized_elemwise_mul
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
diff --git a/src/operator/quantization/quantized_elemwise_mul.cc b/src/operator/quantization/quantized_elemwise_mul.cc
new file mode 100644
index 000000000000..a752c14837a6
--- /dev/null
+++ b/src/operator/quantization/quantized_elemwise_mul.cc
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors + * \file quantized_elemwise_mul.cc + * \brief CPU Implementation of basic elementwise binary mul operators + */ +#include +#include "../tensor/elemwise_binary_op-inl.h" +#include "./quantized_elemwise_mul-inl.h" +#include "./quantization_utils.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(QuantizeElemwiseMulParam); + +static std::vector QuantizedElemwiseMulOutputNames(const NodeAttrs &attrs) { + const QuantizeElemwiseMulParam& params = nnvm::get(attrs.parsed); + if (params.enable_float_output) + return std::vector{"output"}; + else + return std::vector{"output", "min_output", "max_output"}; +} + +inline bool QuantizedElemwiseMulOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + using namespace mshadow; + const QuantizeElemwiseMulParam& params = nnvm::get(attrs.parsed); + const mxnet::TShape &lshape = (*in_attrs)[quantized_elemwise_mul::kLhs]; + const mxnet::TShape &rshape = (*in_attrs)[quantized_elemwise_mul::kRhs]; + if (!ndim_is_known(lshape) || !ndim_is_known(rshape)) return false; + CHECK_EQ(lshape.ndim(), rshape.ndim()) + << "Currently, quantized elemwise multiply doesn't support broadcast."; + for (int i = 0; i < lshape.ndim(); ++i) { + CHECK_EQ(lshape[i], rshape[i]); + } + SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMin, mxnet::TShape(1, 1)); + SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMax, mxnet::TShape(1, 1)); + SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMin, mxnet::TShape(1, 1)); + SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMax, mxnet::TShape(1, 1)); + + out_attrs->clear(); + SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOut, lshape); + if (!params.enable_float_output) { + SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMin, mxnet::TShape(1, 1)); + SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMax, mxnet::TShape(1, 1)); + } + return true; +} + +inline bool QuantizedElemwiseMulOpType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const QuantizeElemwiseMulParam& params = nnvm::get(attrs.parsed); + for (int i = 0; i < 2; ++i) { + if (in_type->at(i) == mshadow::kInt8) { + TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8); + } else { + LOG(ERROR) << "currently, quantized elemwise mul only support int8 inputs."; + } + } + TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMin, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMax, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMin, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMax, mshadow::kFloat32); + + int dtype = mshadow::kInt32; + if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) { + dtype = mshadow::kInt8; + } + if (!params.enable_float_output) { + TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, dtype); + TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMin, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMax, mshadow::kFloat32); + } else { + TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, mshadow::kFloat32); + } + return true; +} + +inline bool QuantizedElemwiseMulOpStorageType(const nnvm::NodeAttrs& attrs, + int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + using namespace common; + *dispatch_mode = DispatchMode::kFCompute; + + for (auto &v : *out_attrs) { + 
v = kDefaultStorage; + if (common::stype_string(v).compare("unknown") == 0) { + return false; + } + } + + for (auto &v : *in_attrs) { + v = kDefaultStorage; + if (common::stype_string(v).compare("unknown") == 0) { + return false; + } + } + return true; +} + +void QuantizedElemwiseMulOpForward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const QuantizeElemwiseMulParam& params = nnvm::get(attrs.parsed); + using namespace mxnet_op; + + float lhs_min = inputs[quantized_elemwise_mul::kLhsMin].dptr()[0]; + float lhs_max = inputs[quantized_elemwise_mul::kLhsMax].dptr()[0]; + float rhs_min = inputs[quantized_elemwise_mul::kRhsMin].dptr()[0]; + float rhs_max = inputs[quantized_elemwise_mul::kRhsMax].dptr()[0]; + + float cached_output_min_ = 0.f; + float cached_output_max_ = 0.f; + float out_data_scale = 1.f; + float out_scale = 1.f; + if (!params.enable_float_output) { + float output_data_range = kInt32Range; + // dataA && dataB are int8 + if (outputs[quantized_elemwise_mul::kOut].type_flag_ == mshadow::kInt8) { + output_data_range = kInt8Range; + } else { + output_data_range = kInt32Range; + } + if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) { + cached_output_min_ = params.min_calib_range.value(); + cached_output_max_ = params.max_calib_range.value(); + out_data_scale = output_data_range / MaxAbs(cached_output_min_, cached_output_max_); + auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max); + auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max); + out_scale = out_data_scale / lhs_scale / rhs_scale; + } else { + Stream *s = ctx.get_stream(); + if (inputs[quantized_elemwise_mul::kLhs].type_flag_ == mshadow::kInt8 && + inputs[quantized_elemwise_mul::kRhs].type_flag_ == mshadow::kInt8) { + mxnet_op::Kernel::Launch( + s, 1, &cached_output_min_, &cached_output_max_, &lhs_min, &lhs_max, &rhs_min, &rhs_max); + } else { + LOG(ERROR) << "lhs and rhs only support iny8 dtype."; + } + } + } else { + auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max); + auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max); + out_scale = 1.0 / lhs_scale / rhs_scale; + } + + size_t out_size = outputs[quantized_elemwise_mul::kOut].Size(); + auto *input_l = inputs[quantized_elemwise_mul::kLhs].dptr(); + auto *input_r = inputs[quantized_elemwise_mul::kRhs].dptr(); + // TODO(Xinyu): a temp solution to enable Elemwise INT8 computation, + // will be refactored after the DNNL primitive is done. 
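+  // Requantization sketch (descriptive note): each input is quantized with
+  // scale kInt8Range / MaxAbs(min, max), so an int8*int8 product carries
+  // lhs_scale * rhs_scale. Dividing the calibrated output scale by both input
+  // scales (out_scale = out_data_scale / lhs_scale / rhs_scale) maps products
+  // into the requested int8/int32 range; with float output, out_scale =
+  // 1 / lhs_scale / rhs_scale recovers real-valued results directly.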
+ if (!params.enable_float_output) { + if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) { + typedef int8_t out_type; + auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr(); +#if !defined(_MSC_VER) +#pragma omp simd +#endif + for (size_t i = 0; i < out_size; ++i) { + const int8_t a = input_l[i]; + const int8_t b = input_r[i]; + out_data[i] = static_cast(a * b * out_scale); + } + } else { + typedef int32_t out_type; + auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr(); +#if !defined(_MSC_VER) +#pragma omp simd +#endif + for (size_t i = 0; i < out_size; ++i) { + const int8_t a = input_l[i]; + const int8_t b = input_r[i]; + out_data[i] = static_cast(a * b * out_scale); + } + } + } else { + typedef float_t out_type; + auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr(); +#if !defined(_MSC_VER) +#pragma omp simd +#endif + for (size_t i = 0; i < out_size; ++i) { + const int8_t a = input_l[i]; + const int8_t b = input_r[i]; + out_data[i] = static_cast(a * b * out_scale); + } + } + + if (!params.enable_float_output) { + outputs[quantized_elemwise_mul::kOutMin].dptr()[0] = cached_output_min_; + outputs[quantized_elemwise_mul::kOutMax].dptr()[0] = cached_output_max_; + } +} + +NNVM_REGISTER_OP(_contrib_quantized_elemwise_mul) +.describe(R"code(Multiplies arguments int8 element-wise. +)code" ADD_FILELINE) +.set_num_inputs(6) +.set_num_outputs([](const NodeAttrs& attrs) { + const QuantizeElemwiseMulParam& params = nnvm::get(attrs.parsed); + return (!params.enable_float_output) ? 3 : 1; +}) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs", "lhs_min", "lhs_max", "rhs_min", "rhs_max"}; + }) +.set_attr("FListOutputNames", QuantizedElemwiseMulOutputNames) +.set_attr("FInferShape", QuantizedElemwiseMulOpShape) +.set_attr("FInferType", QuantizedElemwiseMulOpType) +.set_attr("FInferStorageType", QuantizedElemwiseMulOpStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", QuantizedElemwiseMulOpForward) +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. 
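+// The quantized kernel is inference-only; registering MakeZeroGradNodes below
+// supplies an all-zero gradient so that graph construction does not fail when
+// a backward pass is requested.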
+.set_attr("FGradient", MakeZeroGradNodes) +.set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) +.add_argument("lhs", "NDArray-or-Symbol", "first input") +.add_argument("rhs", "NDArray-or-Symbol", "second input") +.add_argument("lhs_min", "NDArray-or-Symbol", "Minimum value of first input.") +.add_argument("lhs_max", "NDArray-or-Symbol", "Maximum value of first input.") +.add_argument("rhs_min", "NDArray-or-Symbol", "Minimum value of second input.") +.add_argument("rhs_max", "NDArray-or-Symbol", "Maximum value of second input.") +.set_attr_parser(ParamParser) +.add_arguments(QuantizeElemwiseMulParam::__FIELDS__()); + +NNVM_REGISTER_OP(elemwise_mul) +.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs.op = Op::Get("_contrib_quantized_elemwise_mul"); + node->attrs.name = "quantized_" + attrs.name; + node->attrs.dict = attrs.dict; + if (node->op()->attr_parser != nullptr) { + node->op()->attr_parser(&(node->attrs)); + } + return node; +}); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc index 86797c136bab..0f64fbc51449 100644 --- a/src/operator/random/shuffle_op.cc +++ b/src/operator/random/shuffle_op.cc @@ -122,7 +122,7 @@ void ShuffleForwardCPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_shuffle) .add_alias("shuffle") -.add_alias("_np__random_shuffle") +.add_alias("_npi_shuffle") .describe(R"code(Randomly shuffle the elements. This shuffles the array along the first axis. diff --git a/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h b/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h new file mode 100644 index 000000000000..1469395ec169 --- /dev/null +++ b/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file mkldnn_elemwisemul_post_quantize_property.cc + * \brief Partition gragph property for MKLDNN Quantized ElemwiseMul operator + * \author Xinyu Chen +*/ + +#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ +#if MXNET_USE_MKLDNN == 1 + +#include +#include +#include "../../tensor/elemwise_binary_op-inl.h" +#include "../../quantization/requantize-inl.h" +#include "../common.h" +#include "mkldnn_subgraph_base-inl.h" + +namespace mxnet { +namespace op { + +#define QUANTIZED_ElemwiseMul_NAME "_contrib_quantized_elemwise_mul" + +class ElemwiseMulPostQuantizeSelector : public SubgraphSelector { + public: + /*! 
\brief pattern match status */ + enum SelectStatus { + kFail = 0, + kStart, + kRequantize, + kSuccess, + }; + + private: + bool disable_all; + bool disable_float_output; + SelectStatus status; + std::vector matched_list; + + public: + explicit ElemwiseMulPostQuantizeSelector(const bool dis_all, + const bool dis_float_output) + : disable_all(dis_all), + disable_float_output(dis_float_output) {} + + bool Select(const nnvm::Node &n) override { + if ((!disable_all) && n.op() == Op::Get(QUANTIZED_ElemwiseMul_NAME)) { + status = disable_all ? kSuccess : kStart; + matched_list.clear(); + matched_list.push_back(&n); + return true; + } + return false; + } + + bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override { + return false; + } + + bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override { + if (status == kFail || status == kSuccess || new_node.is_variable()) + return false; + // If n isn't the last matched node, then we encoutered a internal + // branch, we should pop out the node behind n and stop fusion. + if (matched_list.back() != &n) { + if (std::find(matched_list.begin(), matched_list.end(), &n) != + matched_list.end()) { + while (matched_list.back() != &n) { + matched_list.pop_back(); + } + } + + status = kSuccess; + return false; + } + + switch (status) { + case kStart: + if (new_node.op() == Op::Get("_contrib_requantize")) { + auto const ¶m = nnvm::get(new_node.attrs.parsed); + if (param.min_calib_range.has_value() && + param.max_calib_range.has_value()) { + matched_list.push_back(&new_node); + status = kRequantize; + return true; + } + } + case kRequantize: + if ((!disable_float_output) && (new_node.op() == Op::Get("_contrib_dequantize"))) { + matched_list.push_back(&new_node); + status = kSuccess; + return true; + } + default: + status = kSuccess; + return false; + } + } + + std::vector Filter( + const std::vector &candidates) override { + if ((status != kSuccess) || (matched_list.size() <= 1)) { + return std::vector(0); + } else { + std::vector ret; + for (auto i : matched_list) { + auto non_const_i = const_cast(i); + if (std::find(candidates.begin(), candidates.end(), non_const_i) != + candidates.end()) { + ret.push_back(non_const_i); + } + } + return ret; + } + } + + void Reset() override { + CHECK_GE(matched_list.size(), 1); + auto new_selector = ElemwiseMulPostQuantizeSelector(disable_all, disable_float_output); + new_selector.Select(*matched_list[0]); + *this = new_selector; + } +}; + +class ElemwiseMulPostQuantizeProperty : public SubgraphProperty { + public: + ElemwiseMulPostQuantizeProperty() { + disable_fuse_all = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QEM_FUSE_ALL", false); + disable_float_output = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QEM_FLOAT_OUTPUT", false); + } + + static SubgraphPropertyPtr Create() { + static const std::string &name = "MKLDNN EltwiseMul post-quantization optimization pass"; + auto property = std::make_shared(); + property->SetAttr("property_name", name); + property->SetAttr("inference_only", true); + return property; + } + + nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym, + const int subgraph_id = 0) const override { + nnvm::NodePtr em_node = nullptr; + nnvm::NodePtr requantize_node = nullptr; + nnvm::NodePtr dequantize_node = nullptr; + + DFSVisit(sym.outputs, [&](const nnvm::NodePtr &node) { + if (node->is_variable()) return; + if (node->op() == Op::Get(QUANTIZED_ElemwiseMul_NAME)) { + em_node = node; + } else if (node->op() == Op::Get("_contrib_requantize")) { + requantize_node = node; + } else if 
diff --git a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
index 269017ea6a03..18cd3031ef18 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
@@ -23,6 +23,7 @@
 #include "mkldnn_fc_property.h"
 #include "mkldnn_post_quantize_property.h"
 #include "mkldnn_fc_post_quantize_property.h"
+#include "mkldnn_elemwisemul_post_quantize_property.h"
 #include "mkldnn_post_quantize_align_scale_property.h"
 
 namespace mxnet {
@@ -57,6 +58,7 @@ MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeProperty);
 #if MXNET_USE_MKLDNN == 1
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNFCPostQuantizeProperty);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, ElemwiseMulPostQuantizeProperty);
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeAlignScaleProperty);
 #endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cc b/src/operator/tensor/broadcast_reduce_norm_value.cc
index 4cd92d44997e..9acc157f8eca 100644
--- a/src/operator/tensor/broadcast_reduce_norm_value.cc
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cc
@@ -40,7 +40,7 @@ void L2NormComputeEx(const nnvm::NodeAttrs& attrs,
   const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
   mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
   const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
   if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
       param.ord == 2) {
     // l2 norm on the entire array
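[Annotation, illustrative only and not part of the patch.] The TShape() to TShape(0, -1) change in this and the next two files matters because, under the dynamic/NumPy shape semantics, a default-constructed TShape means "unknown ndim" (encoded as -1), while TShape(0, -1) is a known zero-dimensional shape; the sparse l2-norm fast path keys on axis.ndim() == 0. A plain-Python stand-in for the distinction:

UNKNOWN_NDIM = -1

def ndim_of(shape):
    # None plays the role of a default-constructed (unknown) TShape
    return UNKNOWN_NDIM if shape is None else len(shape)

assert ndim_of(None) != 0   # old default: the axis == () fast path is skipped
assert ndim_of(()) == 0     # new default: l2 norm over the entire array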
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cu b/src/operator/tensor/broadcast_reduce_norm_value.cu
index 188c93e61221..735c3d7faec9 100644
--- a/src/operator/tensor/broadcast_reduce_norm_value.cu
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cu
@@ -39,7 +39,7 @@ void L2NormComputeEx(const nnvm::NodeAttrs& attrs,
   const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
   mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
   const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
   if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
       param.ord == 2) {
     // l2 norm on the entire array
diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h
index 27e22491ca35..799f86544160 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -1152,7 +1152,7 @@ inline bool LpNormStorageType(const nnvm::NodeAttrs& attrs,
                      DispatchMode::kFCompute);
   }
   if (param.ord == 2) {
-    const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+    const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
     if (!dispatched && (in_stype == kRowSparseStorage || in_stype == kCSRStorage) &&
         axis.ndim() == 0 && param.ord == 2) {
       // l2 norm: rsp/csr, axis = () -> dns
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 05e10ffa4e16..4486b0dcd712 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -252,6 +252,23 @@ class UnaryOp : public OpBase {
     });
   }
 
+  template<typename xpu, typename OP>
+  static void ComputeInt(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<TBlob>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<TBlob>& outputs) {
+    mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+    MXNET_INT_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        if (inputs[0].Size() != 0) {
+          mxnet_op::Kernel<mxnet_op::op_with_req<OP, Req>, xpu>::Launch(
+              s, inputs[0].Size(), outputs[0].dptr<DType>(), inputs[0].dptr<DType>());
+        }
+      });
+    });
+  }
+
   template<typename xpu, typename OP>
   static void ComputeLogic(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx,
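[Annotation, illustrative only and not part of the patch.] ComputeInt above gives bitwise_not/invert an integer-only kernel path: bitwise negation is only defined on integral types, where on two's complement ~q == -q - 1. A NumPy check of both facts:

import numpy as np

q = np.array([0, 1, -5, 127], dtype=np.int8)
assert np.array_equal(np.bitwise_not(q), -q - 1)   # two's complement identity

try:                                                # floats are rejected,
    np.bitwise_not(np.array([1.0]))                 # which is what restricting
except TypeError:                                   # the kernel to integer
    pass                                            # dtypes enforces in C++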
diff --git a/src/operator/tensor/elemwise_unary_op_trig.cc b/src/operator/tensor/elemwise_unary_op_trig.cc
index a436ebb284a3..e5d662a1b262 100644
--- a/src/operator/tensor/elemwise_unary_op_trig.cc
+++ b/src/operator/tensor/elemwise_unary_op_trig.cc
@@ -188,7 +188,31 @@ The storage type of ``arcsin`` output depends upon the input storage type:
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_arcsin" });
 
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_arcsin,
-                                                  unary_bwd<mshadow_op::arcsin_grad>);
+                                                  unary_bwd<mshadow_op::arcsin_grad>)
+.set_attr<nnvm::FGradient>("FGradient",
+    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+      // ograds[0]: head_grad_grads (dL/dxgrad)
+      // inputs[0]: dL/dy
+      // inputs[1]: x (ElemwiseGradUseIn)
+      // f(x) = arcsin(x)
+      // n: f'(x) = 1/(1-x^2)^1/2
+      // f''(x) = f'(x) * x/(1-x^2)
+      // Note: x/(1-x^2) = x * f'(x)^2
+      auto dydx = n->inputs[0];
+      auto x = n->inputs[1];
+      auto dydx_mul_grad_x = nnvm::NodeEntry{n};
+      auto op = mxnet::util::NodeOpGen{n};
+
+      auto x_grad = op.div(dydx_mul_grad_x, dydx);
+      auto x_grad_square = op.square(x_grad);
+      auto x_grad_square_mul_x = op.mul(x_grad_square, x);
+      auto x_grad_grad = op.mul(dydx_mul_grad_x, x_grad_square_mul_x);
+
+      std::vector<nnvm::NodeEntry> ret;
+      ret.emplace_back(op.mul(ograds[0], x_grad));
+      ret.emplace_back(op.mul(ograds[0], x_grad_grad));
+      return ret;
+    });
 
 // arccos
 MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(arccos, cpu, mshadow_op::arccos)
@@ -207,7 +231,32 @@ The storage type of ``arccos`` output is always dense
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_arccos" });
 
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_arccos,
-                                                  unary_bwd<mshadow_op::arccos_grad>);
+                                                  unary_bwd<mshadow_op::arccos_grad>)
+.set_attr<nnvm::FGradient>("FGradient",
+    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+      // ograds[0]: head_grad_grads (dL/dxgrad)
+      // inputs[0]: dL/dy
+      // inputs[1]: x (ElemwiseGradUseIn)
+      // f(x) = arccos(x)
+      // n: f'(x) = -1/(1-x^2)^1/2
+      // f''(x) = f'(x) * x/(1-x^2)
+      // Note: x/(1-x^2) = x * f'(x)^2
+      auto dydx = n->inputs[0];
+      auto x = n->inputs[1];
+      auto dydx_mul_grad_x = nnvm::NodeEntry{n};
+      auto op = mxnet::util::NodeOpGen{n};
+
+      auto x_grad = op.div(dydx_mul_grad_x, dydx);
+      auto x_grad_square = op.square(x_grad);
+      auto x_grad_square_mul_x = op.mul(x_grad_square, x);
+      auto x_grad_grad = op.mul(dydx_mul_grad_x, x_grad_square_mul_x);
+
+      std::vector<nnvm::NodeEntry> ret;
+      ret.emplace_back(op.mul(ograds[0], x_grad));
+      ret.emplace_back(op.mul(ograds[0], x_grad_grad));
+      return ret;
+    });
+
 // arctan
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(arctan, cpu, mshadow_op::arctan)
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 527737e03cd7..0c40f32d1666 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -341,6 +341,66 @@ def check_quantized_elemwise_add(data_shape, qtype):
         check_quantized_elemwise_add((3, 4, 56, 56), qtype)
         check_quantized_elemwise_add((32, 56, 64, 11), qtype)
 
+@with_seed()
+def test_quantized_elemwise_mul():
+    def check_quantized_elemwise_mul(data_shape, qtype):
+        if is_test_for_native_cpu():
+            print('skipped testing quantized_elemwise_mul for native cpu since it is not supported yet')
+            return
+        elif qtype != 'int8':
+            print('skipped testing quantized_elemwise_mul for not supported data type')
+            return
+        elif is_test_for_gpu():
+            print('skipped testing quantized_elemwise_mul for gpu since it is not supported yet')
+            return
+
+        dataA = mx.sym.Variable(name='dataA', shape=data_shape, dtype='float32')
+        dataB = mx.sym.Variable(name='dataB', shape=data_shape, dtype='float32')
+        elemwise_mul_fp32 = mx.sym.elemwise_mul(dataA, dataB)
+        arg_names = elemwise_mul_fp32.list_arguments()
+        elemwise_mul_fp32_exe = elemwise_mul_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qtype == 'uint8':
+            data_low = 0.0
+            data_high = 255.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+
+        dataA_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32')
+        dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32')
+        elemwise_mul_fp32_exe.arg_dict[arg_names[0]][:] = dataA_val
+
+        elemwise_mul_fp32_exe.arg_dict[arg_names[1]][:] = dataB_val
+
+        output = elemwise_mul_fp32_exe.forward()[0]
+
+        qdataA = mx.sym.Variable(name='qdataA', shape=data_shape, dtype=qtype)
+        qdataB = mx.sym.Variable(name='qdataB', shape=data_shape, dtype=qtype)
+        min_dataA = mx.sym.Variable(name='min_dataA')
+        max_dataA = mx.sym.Variable(name='max_dataA')
+        min_dataB = mx.sym.Variable(name='min_dataB')
+        max_dataB =
mx.sym.Variable(name='max_dataB') + quantized_elemwise_mul = mx.sym.contrib.quantized_elemwise_mul(qdataA, qdataB, min_dataA, max_dataA, min_dataB, max_dataB) + elemwise_mul_int8_exe = quantized_elemwise_mul.simple_bind(ctx=mx.current_context(), grad_req='null') + qarg_names = quantized_elemwise_mul.list_arguments() + elemwise_mul_int8_exe.arg_dict[qarg_names[0]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[0]].astype(qtype) + elemwise_mul_int8_exe.arg_dict[qarg_names[1]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[1]].astype(qtype) + quantized_range = 127.0 + elemwise_mul_int8_exe.arg_dict[qarg_names[2]][:] = data_low + elemwise_mul_int8_exe.arg_dict[qarg_names[3]][:] = data_high + elemwise_mul_int8_exe.arg_dict[qarg_names[4]][:] = data_low + elemwise_mul_int8_exe.arg_dict[qarg_names[5]][:] = data_high + qoutput, min_range, max_range = elemwise_mul_int8_exe.forward() + + fp32_rslt = output.asnumpy() + int8_rslt = qoutput.astype(output.dtype) + assert_almost_equal(fp32_rslt, int8_rslt, atol = 1e-4) + + for qtype in ['int8', 'uint8']: + check_quantized_elemwise_mul((4, 6), qtype) + check_quantized_elemwise_mul((13, 74, 52), qtype) + check_quantized_elemwise_mul((3, 4, 56, 56), qtype) + check_quantized_elemwise_mul((32, 56, 64, 11), qtype) @with_seed() def test_quantized_pooling(): @@ -1005,7 +1065,7 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N else: excluded_sym_names = excluded_names + optional_names if name == 'sym4': - excluded_op_names += ['elemwise_add'] + excluded_op_names += ['elemwise_add', 'elemwise_mul'] qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=s, arg_params=arg_params, diff --git a/tests/python/unittest/test_gluon_batch_processor.py b/tests/python/unittest/test_gluon_batch_processor.py index 4bd6f769aa44..8604713fc129 100644 --- a/tests/python/unittest/test_gluon_batch_processor.py +++ b/tests/python/unittest/test_gluon_batch_processor.py @@ -84,7 +84,7 @@ def test_batch_processor_validation(): ctx = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.metric.Accuracy() - evaluation_loss = gluon.loss.L1Loss() + val_loss = gluon.loss.L1Loss() net.initialize(ctx=ctx) processor = BatchProcessor() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) @@ -93,7 +93,7 @@ def test_batch_processor_validation(): train_metrics=acc, trainer=trainer, context=ctx, - evaluation_loss=evaluation_loss, + val_loss=val_loss, batch_processor=processor) # Input dataloader est.fit(train_data=dataloader, diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py index 924dd083bef4..ca61e4b40caa 100644 --- a/tests/python/unittest/test_gluon_estimator.py +++ b/tests/python/unittest/test_gluon_estimator.py @@ -88,7 +88,7 @@ def test_validation(): ctx = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.metric.Accuracy() - evaluation_loss = gluon.loss.L1Loss() + val_loss = gluon.loss.L1Loss() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, @@ -96,7 +96,7 @@ def test_validation(): train_metrics=acc, trainer=trainer, context=ctx, - evaluation_loss=evaluation_loss) + val_loss=val_loss) # Input dataloader est.fit(train_data=dataloader, val_data=dataloader, @@ -376,16 +376,16 @@ def test_default_handlers(): assert isinstance(handlers[1], MetricHandler) assert isinstance(handlers[4], LoggingHandler) -def test_eval_net(): - ''' test estimator with a different evaluation net ''' +def test_val_net(): + ''' test 
estimator with different training and validation networks ''' ''' test weight sharing of sequential networks without namescope ''' net = _get_test_network() - eval_net = _get_test_network(params=net.collect_params()) + val_net = _get_test_network(params=net.collect_params()) dataloader, dataiter = _get_test_data() num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - evaluation_loss = gluon.loss.L2Loss() + val_loss = gluon.loss.L2Loss() acc = mx.metric.Accuracy() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) @@ -394,8 +394,8 @@ def test_eval_net(): train_metrics=acc, trainer=trainer, context=ctx, - evaluation_loss=evaluation_loss, - eval_net=eval_net) + val_loss=val_loss, + val_net=val_net) with assert_raises(RuntimeError): est.fit(train_data=dataloader, @@ -404,7 +404,7 @@ def test_eval_net(): ''' test weight sharing of sequential networks with namescope ''' net = _get_test_network_with_namescope() - eval_net = _get_test_network_with_namescope(params=net.collect_params()) + val_net = _get_test_network_with_namescope(params=net.collect_params()) net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, @@ -412,8 +412,8 @@ def test_eval_net(): train_metrics=acc, trainer=trainer, context=ctx, - evaluation_loss=evaluation_loss, - eval_net=eval_net) + val_loss=val_loss, + val_net=val_net) est.fit(train_data=dataloader, val_data=dataloader, @@ -422,20 +422,20 @@ def test_eval_net(): ''' test weight sharing of two resnets ''' net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx) net.output = gluon.nn.Dense(10) - eval_net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx) - eval_net.output = gluon.nn.Dense(10, params=net.collect_params()) + val_net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx) + val_net.output = gluon.nn.Dense(10, params=net.collect_params()) dataset = gluon.data.ArrayDataset(mx.nd.zeros((10, 3, 224, 224)), mx.nd.zeros((10, 10))) dataloader = gluon.data.DataLoader(dataset=dataset, batch_size=5) net.initialize(ctx=ctx) - eval_net.initialize(ctx=ctx) + val_net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, loss=loss, train_metrics=acc, trainer=trainer, context=ctx, - evaluation_loss=evaluation_loss, - eval_net=eval_net) + val_loss=val_loss, + val_net=val_net) est.fit(train_data=dataloader, val_data=dataloader, diff --git a/tests/python/unittest/test_higher_order_grad.py b/tests/python/unittest/test_higher_order_grad.py index 527c35d5dd94..eeba4226dc36 100644 --- a/tests/python/unittest/test_higher_order_grad.py +++ b/tests/python/unittest/test_higher_order_grad.py @@ -133,6 +133,44 @@ def grad_grad_op(x): array, tanh, grad_grad_op, rtol=1e-6, atol=1e-6) +@with_seed() +def test_arcsin(): + def arcsin(x): + return nd.arcsin(x) + + def grad_grad_op(x): + return x / nd.sqrt((1-x**2)**3) + + for dim in range(1, 5): + shape = rand_shape_nd(dim) + array = random_arrays(shape) + # Hack: Decrease std_dev to make + # sure all elements + # are in range -1 to 1 + # i.e. 
Domain of arcsin + array *= 0.2 + check_second_order_unary(array, arcsin, grad_grad_op) + + +@with_seed() +def test_arccos(): + def arccos(x): + return nd.arccos(x) + + def grad_grad_op(x): + return -x / nd.sqrt((1-x**2)**3) + + for dim in range(1, 5): + shape = rand_shape_nd(dim) + array = random_arrays(shape) + # Hack: Decrease std_dev to make + # sure all elements + # are in range -1 to 1 + # i.e. Domain of arccos + array *= 0.2 + check_second_order_unary(array, arccos, grad_grad_op) + + @with_seed() def test_arctan(): def arctan(x): diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index a1e5128d8ac6..e7273fba35d5 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -18,6 +18,7 @@ import mxnet as mx import numpy as np import scipy +from scipy.stats import pearsonr import json import math from common import with_seed @@ -267,7 +268,7 @@ def test_pearsonr(): pred1 = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) label1 = mx.nd.array([[1, 0], [0, 1], [0, 1]]) pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1] - pearsonr_expected_scipy, _ = scipy.stats.pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel()) + pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel()) macro_pr = mx.metric.create('pearsonr', average='macro') micro_pr = mx.metric.create('pearsonr', average='micro') @@ -289,7 +290,7 @@ def test_pearsonr(): label12 = mx.nd.array([[1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1]]) pearsonr_expected_np = np.corrcoef(pred12.asnumpy().ravel(), label12.asnumpy().ravel())[0, 1] - pearsonr_expected_scipy, _ = scipy.stats.pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel()) + pearsonr_expected_scipy, _ = pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel()) macro_pr.reset() micro_pr.update([label2], [pred2]) diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index a670f794860f..3d26ee28b22e 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -67,6 +67,23 @@ def _add_workload_unravel_index(): OpArgMngr.add_workload('unravel_index', np.array([],dtype=_np.int64), (10, 3, 5)) OpArgMngr.add_workload('unravel_index', np.array([3], dtype=_np.int32), (2,2)) +def _add_workload_bincount(): + y = np.arange(4).astype(int) + y1 = np.array([1, 5, 2, 4, 1], dtype=_np.int64) + y2 = np.array((), dtype=_np.int8) + w = np.array([0.2, 0.3, 0.5, 0.1]) + w1 = np.array([0.2, 0.3, 0.5, 0.1, 0.2]) + + OpArgMngr.add_workload('bincount', y) + OpArgMngr.add_workload('bincount', y1) + OpArgMngr.add_workload('bincount', y, w) + OpArgMngr.add_workload('bincount', y1, w1) + OpArgMngr.add_workload('bincount', y1, w1, 8) + OpArgMngr.add_workload('bincount', y, minlength=3) + OpArgMngr.add_workload('bincount', y, minlength=8) + OpArgMngr.add_workload('bincount', y2, minlength=0) + OpArgMngr.add_workload('bincount', y2, minlength=5) + def _add_workload_diag(): def get_mat(n): @@ -577,6 +594,10 @@ def _add_workload_around(): OpArgMngr.add_workload('around', np.array([1.56, 72.54, 6.35, 3.25]), decimals=1) +def _add_workload_round(): + OpArgMngr.add_workload('round', np.array([1.56, 72.54, 6.35, 3.25]), decimals=1) + + def _add_workload_argsort(): for dtype in [np.int32, np.float32]: a = np.arange(101, dtype=dtype) @@ -741,7 +762,7 @@ def _add_workload_reshape(): # OpArgMngr.add_workload('reshape', b, (2, 2), 
order='F') # Items are not equal with order='F' a = np.array(_np.ones((0, 2))) - OpArgMngr.add_workload('reshape', a, -1, 2) + OpArgMngr.add_workload('reshape', a, (-1, 2)) def _add_workload_rint(array_pool): @@ -1225,6 +1246,24 @@ def _add_workload_logical_not(array_pool): OpArgMngr.add_workload('logical_not', np.array([True, False, True, False], dtype=np.bool)) +def _add_workload_bitwise_not(): + OpArgMngr.add_workload('bitwise_not', np.array([True, False, True, False], dtype=np.bool)) + for dtype in [np.int8, np.int32, np.int64]: + zeros = np.array([0], dtype=dtype) + ones = np.array([-1], dtype=dtype) + OpArgMngr.add_workload('bitwise_not', zeros) + OpArgMngr.add_workload('bitwise_not', ones) + + +def _add_workload_invert(): + OpArgMngr.add_workload('invert', np.array([True, False, True, False], dtype=np.bool)) + for dtype in [np.int8, np.int32, np.int64]: + zeros = np.array([0], dtype=dtype) + ones = np.array([-1], dtype=dtype) + OpArgMngr.add_workload('invert', zeros) + OpArgMngr.add_workload('invert', ones) + + def _add_workload_vdot(): OpArgMngr.add_workload('vdot', np.random.normal(size=(2, 4)), np.random.normal(size=(4, 2))) OpArgMngr.add_workload('vdot', np.random.normal(size=(2, 4)).astype(np.float64), np.random.normal(size=(2, 4)).astype(np.float64)) @@ -1407,8 +1446,10 @@ def _prepare_workloads(): _add_workload_argmin() _add_workload_argmax() _add_workload_around() + _add_workload_round() _add_workload_argsort() _add_workload_append() + _add_workload_bincount() _add_workload_broadcast_arrays(array_pool) _add_workload_broadcast_to() _add_workload_clip() @@ -1508,6 +1549,8 @@ def _prepare_workloads(): _add_workload_turnc(array_pool) _add_workload_floor(array_pool) _add_workload_logical_not(array_pool) + _add_workload_bitwise_not() + _add_workload_invert() _add_workload_vdot() _add_workload_vstack(array_pool) _add_workload_column_stack() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 545466bf0814..3f9f1d6677cc 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1778,6 +1778,66 @@ def hybrid_forward(self, F, a, *args, **kwargs): check_unary_func(func, ref_grad, shape, low, high) +@with_seed() +@use_np +def test_np_bitwise_not(): + def check_unary_func(func, ref_grad, shape, low, high): + class TestUnary(HybridBlock): + def __init__(self, func): + super(TestUnary, self).__init__() + self._func = func + + def hybrid_forward(self, F, a, *args, **kwargs): + return getattr(F.np, self._func)(a) + + np_func = getattr(_np, func) + mx_func = TestUnary(func) + np_test_data = _np.random.uniform(low, high, shape).astype(_np.int32) + mx_test_data = mx.numpy.array(np_test_data) + for hybridize in [True, False]: + if hybridize: + mx_func.hybridize() + if ref_grad: + mx_test_data.attach_grad() + np_out = np_func(np_test_data) + with mx.autograd.record(): + y = mx_func(mx_test_data) + assert y.shape == np_out.shape + assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + if np_out.dtype == np.bool_: + assert y.dtype == np.bool_ + + if ref_grad: + y.backward() + assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-1, atol=1e-2, equal_nan=True) + + np_out = getattr(_np, func)(np_test_data) + mx_out = getattr(mx.np, func)(mx_test_data) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + + assertRaises(NotImplementedError, getattr(np, func), mx_test_data, where=False) + assertRaises(NotImplementedError, 
getattr(np, func), mx_test_data, subok=False) + assertRaises(NotImplementedError, getattr(np, func), mx_test_data, dtype=_np.int8) + assertRaises(TypeError, getattr(np, func), mx_test_data, dtype="abcdefg") + assertRaises(NotImplementedError, getattr(np, func), mx_test_data, casting='safe') + assertRaises(TypeError, getattr(np, func), mx_test_data, casting='mxnet') + assertRaises(NotImplementedError, getattr(np, func), mx_test_data, order='C') + assertRaises(NotImplementedError, getattr(np, func), mx_test_data, order='mxnet') + + funcs = { + 'bitwise_not' : (None, -5, 5), + 'invert' : (None, -5, 5), + } + ndim = random.choice([2, 3, 4]) + shape = random.choice([rand_shape_nd(ndim, dim=3), (1, 0, 2)]) + for shape in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]: + for func, func_data in funcs.items(): + ref_grad, low, high = func_data + check_unary_func(func, ref_grad, shape, low, high) + + @with_seed() @use_np def test_np_binary_funcs(): @@ -3029,6 +3089,28 @@ def hybrid_forward(self, F, x): assert out.shape == expected_shape +@with_seed() +@use_np +def test_np_randn(): + # Test shapes. + shapes = [ + (3, 3), + (3, 4), + (0, 0), + (3, 3, 3), + (0, 0, 0), + (2, 2, 4, 3), + (2, 2, 4, 3), + (2, 0, 3, 0), + (2, 0, 2, 3) + ] + dtypes = ['float16', 'float32', 'float64'] + for dtype in dtypes: + for shape in shapes: + data_mx = np.random.randn(*shape, dtype=dtype) + assert data_mx.shape == shape + + @with_seed() @use_np def test_random_seed(): @@ -3760,11 +3842,7 @@ def get_grad_b(A, X): nrhs = (-1, 0, 1, 2, 3) dtypes = ['float32', 'float64'] for hybridize, shape, dtype, nrh in itertools.product([False, True], shapes, dtypes, nrhs): - rtol = 1e-3 - atol = 1e-5 - if dtype == 'float32': - rtol = 1e-2 - atol = 1e-4 + rtol, atol =1e-2, 1e-4 test_solve = TestSolve() if hybridize: test_solve.hybridize() @@ -4357,6 +4435,38 @@ def hybrid_forward(self, F, x): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) +@with_seed() +@use_np +def test_np_round(): + class TestRound(HybridBlock): + def __init__(self, decimals): + super(TestRound, self).__init__() + self.decimals = decimals + + def hybrid_forward(self, F, x): + return F.np.round(x, self.decimals) + + shapes = [(), (1, 2, 3), (1, 0)] + types = ['int32', 'int64', 'float32', 'float64'] + for hybridize in [True, False]: + for oneType in types: + rtol, atol = 1e-3, 1e-5 + for shape in shapes: + for d in range(-5, 6): + test_round = TestRound(d) + if hybridize: + test_round.hybridize() + x = rand_ndarray(shape, dtype=oneType).as_np_ndarray() + np_out = _np.round(x.asnumpy(), d) + mx_out = test_round(x) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) + + mx_out = np.round(x, d) + np_out = _np.round(x.asnumpy(), d) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) + + @with_seed() @use_np def test_np_nonzero(): @@ -5670,6 +5780,56 @@ def hybrid_forward(self, F, a): assert_almost_equal(elem_mx.asnumpy(), elem_np, rtol=rtol, atol=atol) +@with_seed() +@use_np +def test_np_bincount(): + class TestBincount(HybridBlock): + def __init__(self, minlength=0): + super(TestBincount, self).__init__() + self._minlength = minlength + + def hybrid_forward(self, F, a): + return F.np.bincount(a, None, self._minlength) + + class TestBincountWeights(HybridBlock): + def __init__(self, minlength=0): + super(TestBincountWeights, self).__init__() + self._minlength = minlength + + def hybrid_forward(self, F, a, weights): + return F.np.bincount(a, weights, self._minlength) + + dtypes = 
[np.int8, np.uint8, np.int32, np.int64] + weight_types = [np.int32, np.int64, np.float16, np.float32, np.float64] + shapes = [(), (5,), (10,), (15,), (20,), (30,), (50,)] + min_lengths = [0, 5, 20, 50] + has_weights = [True, False] + combinations = itertools.product([True, False], shapes, dtypes, weight_types, has_weights, min_lengths) + for hybridize, shape, dtype, weight_type, has_weight, minlength in combinations: + rtol = 1e-2 if weight_type == np.float16 else 1e-3 + atol = 1e-4 if weight_type == np.float16 else 1e-5 + if shape != (): + data = np.random.uniform(0, 10, size=shape).astype(dtype) + weights = np.random.uniform(0, 10, size=shape).astype(weight_type) if has_weight else None + else: + data = np.array(()).astype(dtype) + weights = np.array(()).astype(weight_type) if has_weight else None + weights_np = weights.asnumpy() if has_weight else None + test_bincount = TestBincountWeights(minlength) if has_weight else TestBincount(minlength) + if hybridize: + test_bincount.hybridize() + mx_out = test_bincount(data, weights) if has_weight else test_bincount(data) + np_out = _np.bincount(data.asnumpy(), weights_np, minlength) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) + # No backward operation for operator bincount at this moment + + # Test imperative once again + mx_out = np.bincount(data, weights, minlength) + np_out = _np.bincount(data.asnumpy(), weights_np, minlength) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) + + if __name__ == '__main__': import nose nose.runmodule()
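[Annotation, illustrative only and not part of the patch.] The higher-order-gradient tests earlier in this patch encode f''(x) = x / (1-x^2)^(3/2) for arcsin (negated for arccos), which is the same identity the new FGradient lambdas build as f'(x) * x/(1-x^2). A standalone numeric cross-check, plain Python with no MXNet dependency:

import math

def arcsin_grad(x):
    return 1.0 / math.sqrt(1.0 - x * x)          # f'(x) for f(x) = arcsin(x)

def arcsin_grad_grad(x):
    return arcsin_grad(x) * x / (1.0 - x * x)    # f''(x) = f'(x) * x/(1-x^2)

def numeric_grad_grad(x, eps=1e-6):
    # central difference on f'(x)
    return (arcsin_grad(x + eps) - arcsin_grad(x - eps)) / (2.0 * eps)

for x in (-0.6, -0.3, 0.0, 0.25, 0.7):
    assert abs(arcsin_grad_grad(x) - numeric_grad_grad(x)) < 1e-4
    # arccos: f'(x) = -arcsin_grad(x), so f'' flips sign under the same identity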