diff --git a/.gitmodules b/.gitmodules index 836d824a6f5a..e0ffec11bfd0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -20,12 +20,12 @@ path = 3rdparty/mkldnn url = https://github.com/intel/mkl-dnn.git branch = master -[submodule "3rdparty/cub"] - path = 3rdparty/cub - url = https://github.com/dmlc/cub [submodule "3rdparty/tvm"] path = 3rdparty/tvm url = https://github.com/dmlc/tvm [submodule "3rdparty/onnx-tensorrt"] path = 3rdparty/onnx-tensorrt url = https://github.com/onnx/onnx-tensorrt.git +[submodule "3rdparty/nvidia_cub"] + path = 3rdparty/nvidia_cub + url = https://github.com/NVlabs/cub.git diff --git a/3rdparty/cub b/3rdparty/cub deleted file mode 160000 index 05eb57faa0a4..000000000000 --- a/3rdparty/cub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 05eb57faa0a4cac37c2a86fdf4b4dc865a95a1a3 diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 55f3c7bc1d87..3ffea8694adf 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 55f3c7bc1d875fbc7d34fc26651bb8c6818c8355 +Subproject commit 3ffea8694adf9c0363f9abbf162dc0e4a45b22c5 diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn index 722901c9aaef..7de7e5d02bf6 160000 --- a/3rdparty/mkldnn +++ b/3rdparty/mkldnn @@ -1 +1 @@ -Subproject commit 722901c9aaefa579698df778d061d4848ab8c3e3 +Subproject commit 7de7e5d02bf687f971e7668963649728356e0c20 diff --git a/3rdparty/nvidia_cub b/3rdparty/nvidia_cub new file mode 160000 index 000000000000..c3cceac115c0 --- /dev/null +++ b/3rdparty/nvidia_cub @@ -0,0 +1 @@ +Subproject commit c3cceac115c072fb63df1836ff46d8c60d9eb304 diff --git a/3rdparty/sparse-matrix/Makefile b/3rdparty/sparse-matrix/Makefile new file mode 100644 index 000000000000..214312f6586c --- /dev/null +++ b/3rdparty/sparse-matrix/Makefile @@ -0,0 +1,21 @@ +CC = g++ +C = gcc +MKLROOT = /opt/intel/mkl + +ifneq ($(USE_INTEL_PATH),) + MKLROOT = $(USE_INTEL_PATH)/mkl +endif + +CFLAGS = -fpic -O2 -I/opt/intel/mkl/include -c -Wall -Werror -DMKL_ILP64 -m64 -std=c++11 +LDFLAGS = 
-Wl,--start-group -L${MKLROOT}/../compiler/lib/intel64 ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl + +default: libsparse_matrix.so + +libsparse_matrix.so: sparse_matrix.o + $(CC) -shared -o libsparse_matrix.so sparse_matrix.o $(LDFLAGS) + +sparse_matrix.o: sparse_matrix.cc sparse_matrix.h + $(CC) $(CFLAGS) sparse_matrix.cc + +clean: + $(RM) libsparse_matrix.so *.o *~ diff --git a/3rdparty/sparse-matrix/sparse_matrix.cc b/3rdparty/sparse-matrix/sparse_matrix.cc new file mode 100644 index 000000000000..fa362f0f8a18 --- /dev/null +++ b/3rdparty/sparse-matrix/sparse_matrix.cc @@ -0,0 +1,45 @@ +#include +#include +#include +#include +#include "sparse_matrix.h" + + + +bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx, + float* values, float* X, float* y, + int rows, int cols, int X_columns) +{ + + sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; + sparse_status_t status; + sparse_matrix_t A = NULL; + sparse_layout_t layout = SPARSE_LAYOUT_ROW_MAJOR; + float one, zero; + one = (float)1.0; + zero = (float)0.0; + + MKL_INT* rows_end = rows_start + 1; + status = mkl_sparse_s_create_csr(&A, indexing, rows, cols, rows_start, rows_end, col_indx, values); + + if (status != SPARSE_STATUS_SUCCESS) + { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + return false; + } + sparse_operation_t operation = SPARSE_OPERATION_NON_TRANSPOSE; + struct matrix_descr descrA; + descrA.type = SPARSE_MATRIX_TYPE_GENERAL; + + status = mkl_sparse_s_mm(operation, one, A, descrA, layout, X, X_columns, X_columns, zero, y, X_columns); + if (status != SPARSE_STATUS_SUCCESS) + { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + return false; + } + + mkl_sparse_destroy(A); + + return true; + +} diff --git a/3rdparty/sparse-matrix/sparse_matrix.h b/3rdparty/sparse-matrix/sparse_matrix.h new file mode 100644 index 
000000000000..93054a80b374 --- /dev/null +++ b/3rdparty/sparse-matrix/sparse_matrix.h @@ -0,0 +1,48 @@ +#ifndef MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ +#define MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ + + +#if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) +#define SP_INT64 __int64 +#define SP_UINT64 unsigned __int64 +#else +#define SP_INT64 long long int +#define SP_UINT64 unsigned long long int +#endif + + +#if defined _WIN32 || defined __CYGWIN__ + #ifdef BUILDING_DLL + #ifdef __GNUC__ + #define SPM_API_PUBLIC __attribute__ ((dllexport)) + #else + #define SPM_API_PUBLIC __declspec(dllexport) // Note: actually gcc seems to also supports this syntax. + #endif + #else + #ifdef __GNUC__ + #define SPM_API_PUBLIC __attribute__ ((dllimport)) + #else + #define SPM_API_PUBLIC __declspec(dllimport) // Note: actually gcc seems to also supports this syntax. + #endif + #endif + #define SPM_API_LOCAL +#else + #if __GNUC__ >= 4 + #define SPM_API_PUBLIC __attribute__ ((visibility ("default"))) + #define SPM_API_LOCAL __attribute__ ((visibility ("hidden"))) + #else + #define SPM_API_PUBLIC + #define SPM_API_LOCAL + #endif +#endif + + + +extern "C" +{ + extern SPM_API_PUBLIC bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx, + float* values, float* X, float* y, int rows, int cols, int X_columns); + +} + +#endif //MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index d8ef524bb389..9cd68e14093c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)) mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC) 
-mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) +mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) mxnet_option(USE_PROFILER "Build with Profiler support" ON) mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) @@ -255,6 +255,7 @@ if(USE_MKLDNN) add_subdirectory(3rdparty/mkldnn) include_directories(3rdparty/mkldnn/include) + include_directories(${PROJECT_BINARY_DIR}/3rdparty/mkldnn/include) add_definitions(-DUSE_MKL=1) add_definitions(-DCUB_MKL=1) add_definitions(-DMXNET_USE_MKLDNN=1) @@ -327,7 +328,7 @@ endforeach() include_directories("include") include_directories("3rdparty/mshadow") -include_directories("3rdparty/cub") +include_directories("3rdparty/nvidia_cub") include_directories("3rdparty/tvm/nnvm/include") include_directories("3rdparty/tvm/include") include_directories("3rdparty/dmlc-core/include") @@ -511,7 +512,7 @@ list(APPEND CUDA ${MSHADOW_CUDASOURCE}) FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/tvm/nnvm/*.cc" "plugin/*.cc") FILE(GLOB_RECURSE GROUP_Include "src/*.h" "3rdparty/tvm/nnvm/*.h" "3rdparty/mshadow/mshadow/*.h" "plugin/*.h") FILE(GLOB_RECURSE GROUP_CUDA "src/*.cu" "src/*.cuh" "3rdparty/mshadow/mshadow/*.cuh" "plugin/*.cu" - "plugin/*.cuh" "3rdparty/cub/cub/*.cuh") + "plugin/*.cuh" "3rdparty/nvidia_cub/cub/*.cuh") assign_source_group("Source" ${GROUP_SOURCE}) assign_source_group("Include" ${GROUP_Include}) assign_source_group("CUDA" ${GROUP_CUDA}) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index caf61e8be6c2..5c5c217b47eb 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -65,6 +65,10 @@ The committers are the granted write access to the project. - Marco is the creator of the current MXNet CI. * [Carin Meier](https://github.com/gigasquid) - Carin created and is the current maintainer for the Clojure interface. 
+* [Patric Zhao](https://github.com/pengzhao-intel) + - Patric is a parallel computing expert and a major contributor to the MXNet MKL-DNN backend. +* [Tao Lv](https://github.com/TaoLv) + - Tao is a major contributor to the MXNet MKL-DNN backend and performance on CPU. ### Become a Committer @@ -211,10 +215,27 @@ List of Contributors * [Harsh Patel](https://github.com/harshp8l) * [Xiao Wang](https://github.com/BeyonderXX) * [Piyush Ghai](https://github.com/piyushghai) +* [Dang Trung Kien](https://github.com/kiendang) * [Zach Boldyga](https://github.com/zboldyga) * [Gordon Reid](https://github.com/gordon1992) - * [Ming Yang](http://ufoym.com) +* [Satya Krishna Gorti](https://github.com/satyakrishnagorti) +* [Neo Chien](https://github.com/cchung100m) +* [Wujie Zhou](https://github.com/eureka7mt) +* [Ciyong Chen](https://github.com/ciyongch) +* [Hao Li](https://github.com/lihaofd) +* [Jin Huang](https://github.com/jinhuang415) +* [Luobao Zou](https://github.com/luobao-intel) +* [Pengxin Yuan](https://github.com/pengxin99) +* [Rong Zhang](https://github.com/rongzha1/) +* [Shu Zhang](https://github.com/Sherry-Zhang) +* [Shufan Wu](https://github.com/juliusshufan) +* [Wenting Jiang](https://github.com/wentingj) +* [Xiaotao Chen](https://github.com/XiaotaoChen) +* [Xinyu Chen](https://github.com/xinyu-intel) +* [Zhennan Qin](https://github.com/ZhennanQin) +* [Zhiyuan Huang](https://github.com/huangzhiyuan) +* [Zak Jost](https://github.com/zjost) Label Bot --------- diff --git a/LICENSE b/LICENSE index b73ba3740c3a..72fe08f4e316 100644 --- a/LICENSE +++ b/LICENSE @@ -315,10 +315,10 @@ Copyright 2005-2008, Google Inc. 3. Moderngpu - For details, see, 3rdparty/ctc_include/contrib/moderngpu/LICENSE Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. - 4. CUB Library - For details, see, 3rdparty/cub/LICENSE.TXT + 4. CUB Library - For details, see, 3rdparty/nvidia_cub/LICENSE.TXT Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 
Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. - 5. CUB mersenne.h - For details, see 3rdparty/cub/test/mersenne.h + 5. CUB mersenne.h - For details, see 3rdparty/nvidia_cub/test/mersenne.h Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 6. Googlemock - For details, see, 3rdparty/googletest/googlemock/LICENSE Copyright 2006-2015, Google Inc. diff --git a/MKLDNN_README.md b/MKLDNN_README.md index 214fc83985fb..34790c9c513d 100644 --- a/MKLDNN_README.md +++ b/MKLDNN_README.md @@ -15,316 +15,4 @@ -# Build/Install MXNet with MKL-DNN - -A better training and inference performance is expected to be achieved on Intel-Architecture CPUs with MXNet built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) on multiple operating system, including Linux, Windows and MacOS. -In the following sections, you will find build instructions for MXNet with Intel MKL-DNN on Linux, MacOS and Windows. - -The detailed performance data collected on Intel Xeon CPU with MXNet built with Intel MKL-DNN can be found [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu). - - -

Contents

- -* [1. Linux](#1) -* [2. MacOS](#2) -* [3. Windows](#3) -* [4. Verify MXNet with python](#4) -* [5. Enable MKL BLAS](#5) -* [6. Enable graph optimization](#6) -* [7. Quantization](#7) -* [8. Support](#8) - -

Linux

- -### Prerequisites - -``` -sudo apt-get update -sudo apt-get install -y build-essential git -sudo apt-get install -y libopenblas-dev liblapack-dev -sudo apt-get install -y libopencv-dev -sudo apt-get install -y graphviz -``` - -### Clone MXNet sources - -``` -git clone --recursive https://github.com/apache/incubator-mxnet.git -cd incubator-mxnet -``` - -### Build MXNet with MKL-DNN - -``` -make -j $(nproc) USE_OPENCV=1 USE_MKLDNN=1 USE_BLAS=mkl USE_INTEL_PATH=/opt/intel -``` - -If you don't have the full [MKL](https://software.intel.com/en-us/intel-mkl) library installation, you might use OpenBLAS as the blas library, by setting USE_BLAS=openblas. - -

MacOS

- -### Prerequisites - -Install the dependencies, required for MXNet, with the following commands: - -- [Homebrew](https://brew.sh/) -- llvm (clang in macOS does not support OpenMP) -- OpenCV (for computer vision operations) - -``` -# Paste this command in Mac terminal to install Homebrew -/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" - -# install dependency -brew update -brew install pkg-config -brew install graphviz -brew tap homebrew/core -brew install opencv -brew tap homebrew/versions -brew install llvm -``` - -### Clone MXNet sources - -``` -git clone --recursive https://github.com/apache/incubator-mxnet.git -cd incubator-mxnet -``` - -### Build MXNet with MKL-DNN - -``` -LIBRARY_PATH=$(brew --prefix llvm)/lib/ make -j $(sysctl -n hw.ncpu) CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ USE_OPENCV=1 USE_OPENMP=1 USE_MKLDNN=1 USE_BLAS=apple USE_PROFILER=1 -``` - -

Windows

- -On Windows, you can use [Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNet with Intel MKL-DNN. -[Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is recommended. - -**Visual Studio 2015** - -To build and install MXNet yourself, you need the following dependencies. Install the required dependencies: - -1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition. -2. Download and Install [CMake 3](https://cmake.org/) if it is not already installed. -3. Download and install [OpenCV 3](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download). -4. Unzip the OpenCV package. -5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (```C:\opencv\build\x64\vc14``` for example). Also, you need to add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ``PATH`` variable. -6. If you have Intel Math Kernel Library (MKL) installed, set ```MKL_ROOT``` to point to ```MKL``` directory that contains the ```include``` and ```lib```. If you want to use MKL blas, you should set ```-DUSE_BLAS=mkl``` when cmake. Typically, you can find the directory in -```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl```. -7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). Note that you should also download ```mingw64.dll.zip`` along with openBLAS and add them to PATH. -8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. 
Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```. - -After you have installed all of the required dependencies, build the MXNet source code: - -1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet). Don't forget to pull the submodules: -``` -git clone --recursive https://github.com/apache/incubator-mxnet.git -``` - -2. Copy file `3rdparty/mkldnn/config_template.vcxproj` to incubator-mxnet root. - -3. Start a Visual Studio command prompt. - -4. Use [CMake 3](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the -[CMake 3](https://cmake.org/) command: -``` -mkdir build -cd build -cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release -``` - -5. In Visual Studio, open the solution file,```.sln```, and compile it. -These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder. -Also ```libmkldnn.dll``` with be in the ```./build/3rdparty/mkldnn/src/Release/``` - -6. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet. - -**Visual Studio 2017** - -To build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), you need the following dependencies. Install the required dependencies: - -1. If [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) is not already installed, download and install it. You can download and install the free community edition. -2. 
Download and install [CMake 3](https://cmake.org/files/v3.11/cmake-3.11.0-rc4-win64-x64.msi) if it is not already installed. -3. Download and install [OpenCV](https://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.4.1/opencv-3.4.1-vc14_vc15.exe/download). -4. Unzip the OpenCV package. -5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (e.g., ```OpenCV_DIR = C:\utils\opencv\build```). -6. If you don’t have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](https://sourceforge.net/projects/openblas/files/v0.2.20/OpenBLAS%200.2.20%20version.zip/download). -7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories (e.g., ```OpenBLAS_HOME = C:\utils\OpenBLAS```). - -After you have installed all of the required dependencies, build the MXNet source code: - -1. Start ```cmd``` in windows. - -2. Download the MXNet source code from GitHub by using following command: - -```r -cd C:\ -git clone --recursive https://github.com/apache/incubator-mxnet.git -``` - -3. Copy file `3rdparty/mkldnn/config_template.vcxproj` to incubator-mxnet root. - -4. Follow [this link](https://docs.microsoft.com/en-us/visualstudio/install/modify-visual-studio) to modify ```Individual components```, and check ```VC++ 2017 version 15.4 v14.11 toolset```, and click ```Modify```. - -5. Change the version of the Visual studio 2017 to v14.11 using the following command (by default the VS2017 is installed in the following path): - -```r -"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.11 -``` - -6. Create a build dir using the following command and go to the directory, for example: - -```r -mkdir C:\build -cd C:\build -``` - -7. CMake the MXNet source code by using following command: - -```r -cmake -G "Visual Studio 15 2017 Win64" .. 
-T host=x64 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release -``` - -8. After the CMake successfully completed, compile the the MXNet source code by using following command: - -```r -msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount -``` - -9. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet. - -

Verify MXNet with python

- -``` -cd python -sudo python setup.py install -python -c "import mxnet as mx;print((mx.nd.ones((2, 3))*2).asnumpy());" - -Expected Output: - -[[ 2. 2. 2.] - [ 2. 2. 2.]] -``` - -### Verify whether MKL-DNN works - -After MXNet is installed, you can verify if MKL-DNN backend works well with a single Convolution layer. - -``` -import mxnet as mx -import numpy as np - -num_filter = 32 -kernel = (3, 3) -pad = (1, 1) -shape = (32, 32, 256, 256) - -x = mx.sym.Variable('x') -w = mx.sym.Variable('w') -y = mx.sym.Convolution(data=x, weight=w, num_filter=num_filter, kernel=kernel, no_bias=True, pad=pad) -exe = y.simple_bind(mx.cpu(), x=shape) - -exe.arg_arrays[0][:] = np.random.normal(size=exe.arg_arrays[0].shape) -exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape) - -exe.forward(is_train=False) -o = exe.outputs[0] -t = o.asnumpy() -``` - -More detailed debugging and profiling information can be logged by setting the environment variable 'MKLDNN_VERBOSE': -``` -export MKLDNN_VERBOSE=1 -``` -For example, by running above code snippet, the following debugging logs providing more insights on MKL-DNN primitives `convolution` and `reorder`. That includes: Memory layout, infer shape and the time cost of primitive execution. -``` -mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_nchw out:f32_nChw16c,num:1,32x32x256x256,6.47681 -mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_oihw out:f32_OIhw16i16o,num:1,32x32x3x3,0.0429688 -mkldnn_verbose,exec,convolution,jit:avx512_common,forward_inference,fsrc:nChw16c fwei:OIhw16i16o fbia:undef fdst:nChw16c,alg:convolution_direct,mb32_g1ic32oc32_ih256oh256kh3sh1dh0ph1_iw256ow256kw3sw1dw0pw1,9.98193 -mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_oihw out:f32_OIhw16i16o,num:1,32x32x3x3,0.0510254 -mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_nChw16c out:f32_nchw,num:1,32x32x256x256,20.4819 -``` - -

Enable MKL BLAS

- -With MKL BLAS, the performace is expected to furtherly improved with variable range depending on the computation load of the models. -You can redistribute not only dynamic libraries but also headers, examples and static libraries on accepting the license [Intel® Simplified license](https://software.intel.com/en-us/license/intel-simplified-software-license). -Installing the full MKL installation enables MKL support for all operators under the linalg namespace. - - 1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl) - - 2. Run `make -j ${nproc} USE_BLAS=mkl` - - 3. Navigate into the python directory - - 4. Run `sudo python setup.py install` - -### Verify whether MKL works - -After MXNet is installed, you can verify if MKL BLAS works well with a single dot layer. - -``` -import mxnet as mx -import numpy as np - -shape_x = (1, 10, 8) -shape_w = (1, 12, 8) - -x_npy = np.random.normal(0, 1, shape_x) -w_npy = np.random.normal(0, 1, shape_w) - -x = mx.sym.Variable('x') -w = mx.sym.Variable('w') -y = mx.sym.batch_dot(x, w, transpose_b=True) -exe = y.simple_bind(mx.cpu(), x=x_npy.shape, w=w_npy.shape) - -exe.forward(is_train=False) -o = exe.outputs[0] -t = o.asnumpy() -``` - -You can open the `MKL_VERBOSE` flag by setting environment variable: -``` -export MKL_VERBOSE=1 -``` -Then by running above code snippet, you probably will get the following output message which means `SGEMM` primitive from MKL are called. Layout information and primitive execution performance are also demonstrated in the log message. 
-``` -Numpy + Intel(R) MKL: THREADING LAYER: (null) -Numpy + Intel(R) MKL: setting Intel(R) MKL to use INTEL OpenMP runtime -Numpy + Intel(R) MKL: preloading libiomp5.so runtime -MKL_VERBOSE Intel(R) MKL 2018.0 Update 1 Product build 20171007 for Intel(R) 64 architecture Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) enabled processors, Lnx 2.40GHz lp64 intel_thread NMICDev:0 -MKL_VERBOSE SGEMM(T,N,12,10,8,0x7f7f927b1378,0x1bc2140,8,0x1ba8040,8,0x7f7f927b1380,0x7f7f7400a280,12) 8.93ms CNR:OFF Dyn:1 FastMM:1 TID:0 NThr:40 WDiv:HOST:+0.000 -``` - -

Enable graph optimization

- -Graph optimization by subgraph feature are available in master branch. You can build from source and then use below command to enable this *experimental* feature for better performance: - -``` -export MXNET_SUBGRAPH_BACKEND=MKLDNN -``` - -This limitations of this experimental feature are: - -- Use this feature only for inference. When training, be sure to turn the feature off by unsetting the `MXNET_SUBGRAPH_BACKEND` environment variable. - -- This feature will only run on the CPU, even if you're using a GPU-enabled build of MXNet. - -- [MXNet Graph Optimization and Quantization Technical Information and Performance Details](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN). - -

Quantization and Inference with INT8

- -Benefiting from Intel® MKL-DNN, MXNet built with Intel® MKL-DNN brings outstanding performance improvement on quantization and inference with INT8 Intel® CPU Platform on Intel® Xeon® Scalable Platform. - -- [CNN Quantization Examples](https://github.com/apache/incubator-mxnet/tree/master/example/quantization). - -

Next Steps and Support

- -- For questions or support specific to MKL, visit the [Intel MKL](https://software.intel.com/en-us/mkl) website. - -- For questions or support specific to MKL, visit the [Intel MKLDNN](https://github.com/intel/mkl-dnn) website. - -- If you find bugs, please open an issue on GitHub for [MXNet with MKL](https://github.com/apache/incubator-mxnet/labels/MKL) or [MXNet with MKLDNN](https://github.com/apache/incubator-mxnet/labels/MKLDNN). +File is moved to [docs/tutorials/mkldnn/MKLDNN_README.md](docs/tutorials/mkldnn/MKLDNN_README.md). diff --git a/Makefile b/Makefile index cd6610581893..53998ac31919 100644 --- a/Makefile +++ b/Makefile @@ -129,7 +129,7 @@ ifdef CAFFE_PATH endif ifndef LINT_LANG - LINT_LANG="all" + LINT_LANG = "all" endif ifeq ($(USE_MKLDNN), 1) @@ -144,13 +144,36 @@ ifeq ($(USE_MKLDNN), 1) LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}' endif + # setup opencv ifeq ($(USE_OPENCV), 1) - CFLAGS += -DMXNET_USE_OPENCV=1 $(shell pkg-config --cflags opencv) - LDFLAGS += $(filter-out -lopencv_ts, $(shell pkg-config --libs opencv)) + CFLAGS += -DMXNET_USE_OPENCV=1 + ifneq ($(filter-out NONE, $(USE_OPENCV_INC_PATH)),) + CFLAGS += -I$(USE_OPENCV_INC_PATH)/include + ifeq ($(filter-out NONE, $(USE_OPENCV_LIB_PATH)),) +$(error Please add the path of OpenCV shared library path into `USE_OPENCV_LIB_PATH`, when `USE_OPENCV_INC_PATH` is not NONE) + endif + LDFLAGS += -L$(USE_OPENCV_LIB_PATH) + ifneq ($(wildcard $(USE_OPENCV_LIB_PATH)/libopencv_imgcodecs.*),) + LDFLAGS += -lopencv_imgcodecs + endif + ifneq ($(wildcard $(USE_OPENCV_LIB_PATH)/libopencv_highgui.*),) + LDFLAGS += -lopencv_highgui + endif + else + ifeq ("$(shell pkg-config --exists opencv4; echo $$?)", "0") + OPENCV_LIB = opencv4 + else + OPENCV_LIB = opencv + endif + CFLAGS += $(shell pkg-config --cflags $(OPENCV_LIB)) + LDFLAGS += $(shell pkg-config --libs-only-L $(OPENCV_LIB)) + LDFLAGS += $(filter -lopencv_imgcodecs -lopencv_highgui, $(shell pkg-config --libs-only-l $(OPENCV_LIB))) + 
endif + LDFLAGS += -lopencv_imgproc -lopencv_core BIN += bin/im2rec else - CFLAGS+= -DMXNET_USE_OPENCV=0 + CFLAGS += -DMXNET_USE_OPENCV=0 endif ifeq ($(USE_OPENMP), 1) @@ -388,6 +411,14 @@ ifeq ($(USE_DIST_KVSTORE), 1) LDFLAGS += $(PS_LDFLAGS_A) endif +#sparse-matrix +ifeq ($(USE_BLAS), mkl) + SPARSE_MATRIX_DIR = $(ROOTDIR)/3rdparty/sparse-matrix + LIB_DEP += $(SPARSE_MATRIX_DIR)/libsparse_matrix.so + CFLAGS += -I$(SPARSE_MATRIX_DIR) + LDFLAGS += -L$(SPARSE_MATRIX_DIR) -lsparse_matrix -Wl,-rpath,'$${ORIGIN}' +endif + .PHONY: clean all extra-packages test lint docs clean_all rcpplint rcppexport roxygen\ cython2 cython3 cython cyclean @@ -431,7 +462,7 @@ LIB_DEP += $(DMLC_CORE)/libdmlc.a $(NNVM_PATH)/lib/libnnvm.a ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(PLUGIN_OBJ) $(LIB_DEP) ifeq ($(USE_CUDA), 1) - CFLAGS += -I$(ROOTDIR)/3rdparty/cub + CFLAGS += -I$(ROOTDIR)/3rdparty/nvidia_cub ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ) $(PLUGIN_CUOBJ) LDFLAGS += -lcufft ifeq ($(ENABLE_CUDA_RTC), 1) @@ -525,11 +556,30 @@ ifeq ($(UNAME_S), Darwin) endif endif +ifeq ($(USE_BLAS), mkl) +ifeq ($(UNAME_S), Darwin) + install_name_tool -change '@rpath/libsparse_matrix.dylib' '@loader_path/libsparse_matrix.dylib' $@ +endif +endif + $(PS_PATH)/build/libps.a: PSLITE PSLITE: $(MAKE) CXX="$(CXX)" DEPS_PATH="$(DEPS_PATH)" -C $(PS_PATH) ps +ifeq ($(USE_BLAS), mkl) +$(SPARSE_MATRIX_DIR)/libsparse_matrix.so: SPARSE_MATRIX + +SPARSE_MATRIX: +ifeq ($(USE_INTEL_PATH), NONE) + $(MAKE) -C $(SPARSE_MATRIX_DIR) +else + $(MAKE) -C $(SPARSE_MATRIX_DIR) USE_INTEL_PATH=$(USE_INTEL_PATH) +endif + mkdir -p $(ROOTDIR)/lib + cp $(SPARSE_MATRIX_DIR)/libsparse_matrix.so $(ROOTDIR)/lib/ +endif + $(DMLC_CORE)/libdmlc.a: DMLCCORE DMLCCORE: @@ -606,6 +656,10 @@ rpkg: cp -rf lib/libmklml_intel.so R-package/inst/libs; \ fi + if [ -e "lib/libsparse_matrix.so" ]; then \ + cp -rf lib/libsparse_matrix.so R-package/inst/libs; \ + fi + mkdir -p R-package/inst/include cp -rl include/* R-package/inst/include Rscript -e 
"if(!require(devtools)){install.packages('devtools', repo = 'https://cloud.r-project.org/')}" @@ -648,8 +702,10 @@ rclean: ifneq ($(EXTRA_OPERATORS),) clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) $(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~ + (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - + cd $(SPARSE_MATRIX_DIR); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - $(RM) -r $(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS)) @@ -657,8 +713,10 @@ clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) else clean: rclean mkldnn_clean cyclean testclean $(EXTRA_PACKAGES_CLEAN) $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ + (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - + cd $(SPARSE_MATRIX_DIR); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - endif diff --git a/NEWS.md b/NEWS.md index 1af4b138e600..ad842ac84786 100644 --- a/NEWS.md +++ b/NEWS.md @@ -164,7 +164,7 @@ MKLDNN backend takes advantage of MXNet subgraph to implement the most of possib ##### Quantization Performance of reduced-precision (INT8) computation is also dramatically improved after the graph optimization feature is applied on CPU Platforms. Various models are supported and can benefit from reduced-precision computation, including symbolic models, Gluon models and even custom models. Users can run most of the pre-trained models with only a few lines of commands and a new quantization script imagenet_gen_qsym_mkldnn.py. The observed accuracy loss is less than 0.5% for popular CNN networks, like ResNet-50, Inception-BN, MobileNet, etc. 
-Please find detailed information and performance/accuracy numbers here: [MKLDNN README](https://github.com/apache/incubator-mxnet/blob/master/MKLDNN_README.md), [quantization README](https://github.com/apache/incubator-mxnet/tree/master/example/quantization#1) and [design proposal](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN) +Please find detailed information and performance/accuracy numbers here: [MKLDNN README](https://github.com/apache/incubator-mxnet/blob/master/docs/tutorials/mkldnn/MKLDNN_README.md), [quantization README](https://github.com/apache/incubator-mxnet/tree/master/example/quantization#1) and [design proposal](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN) ### New Operators @@ -584,7 +584,7 @@ Submodule@commit ID::Last updated by MXNet:: Last update in submodule * dlpack@10892ac:: Oct 30, 2017 :: Aug 23, 2018 * dmlc-core@0a0e8ad:: Aug 15, 2018 :: Nov 15, 2018 * googletest@ec44c6c:: July 14, 2016 :: July 14, 2016 -* mkldnn@a7c5f53:: Nov 7, 2018 :: Nov 5, 2018 +* mkldnn@722901c:: Feb 13, 2019 :: Feb 12, 2019 * mshadow@696803b:: Sep 28, 2018 :: Nov 7, 2018 * onnx-tensorrt@3d8ee04:: Aug 22, 2018 :: Nov 10, 2018 * openmp@37c7212: Nov 22, 2017 :: Nov 13, 2018 diff --git a/R-package/src/ndarray.cc b/R-package/src/ndarray.cc index 94d24f3fb46b..0409d3ba8887 100644 --- a/R-package/src/ndarray.cc +++ b/R-package/src/ndarray.cc @@ -179,9 +179,9 @@ Rcpp::RObject NDArrayPacker::CreateNDArrayPacker() { } Rcpp::Dimension NDArray::dim() const { - mx_uint ndim; - const mx_uint *pshape; - MX_CALL(MXNDArrayGetShape( + int ndim; + const int *pshape; + MX_CALL(MXNDArrayGetShapeEx( ptr_->handle, &ndim, &pshape)); Rcpp::IntegerVector dat(pshape, pshape + ndim); std::reverse(dat.begin(), dat.end()); diff --git a/R-package/src/symbol.cc b/R-package/src/symbol.cc index 031c9a254019..317e82568012 100644 --- 
a/R-package/src/symbol.cc +++ b/R-package/src/symbol.cc @@ -167,8 +167,8 @@ Symbol::RObjectType Symbol::GetOutput(mx_uint index) const { // helper function to convert shape into Rcpp vector inline Rcpp::List BuildShapeData(mx_uint shape_size, - const mx_uint *shape_ndim, - const mx_uint **shape_data, + const int *shape_ndim, + const int **shape_data, const std::vector &names) { Rcpp::List ret(shape_size); for (mx_uint i = 0; i < shape_size; ++i) { @@ -185,7 +185,7 @@ SEXP Symbol::InferShape(const Rcpp::List& kwargs) const { << "Need to pass parameters in key=value style.\n"; std::vector keys = kwargs.names(); std::vector arg_ind_ptr(1, 0); - std::vector arg_shape_data; + std::vector arg_shape_data; for (size_t i = 0; i < kwargs.size(); ++i) { RCHECK(keys[i].length() != 0) @@ -197,17 +197,17 @@ SEXP Symbol::InferShape(const Rcpp::List& kwargs) const { std::vector c_keys = CKeys(keys); mx_uint in_shape_size; - const mx_uint *in_shape_ndim; - const mx_uint **in_shape_data; + const int *in_shape_ndim; + const int **in_shape_data; mx_uint out_shape_size; - const mx_uint *out_shape_ndim; - const mx_uint **out_shape_data; + const int *out_shape_ndim; + const int **out_shape_data; mx_uint aux_shape_size; - const mx_uint *aux_shape_ndim; - const mx_uint **aux_shape_data; + const int *aux_shape_ndim; + const int **aux_shape_data; int complete; - MX_CALL(MXSymbolInferShape( + MX_CALL(MXSymbolInferShapeEx( handle_, static_cast(kwargs.size()), dmlc::BeginPtr(c_keys), dmlc::BeginPtr(arg_ind_ptr), dmlc::BeginPtr(arg_shape_data), &in_shape_size, &in_shape_ndim, &in_shape_data, diff --git a/R-package/tests/testthat/get_data.R b/R-package/tests/testthat/get_data.R index 691131c11739..9bcacdb46ac8 100644 --- a/R-package/tests/testthat/get_data.R +++ b/R-package/tests/testthat/get_data.R @@ -55,13 +55,16 @@ GetInception <- function() { if (!dir.exists("model")) { dir.create("model/") } + if (!file.exists("model/Inception-BN-0126.params")) { - 
download.file("http://data.dmlc.ml/models/imagenet/inception-bn/Inception-BN-0126.params", - destfile = "model/Inception-BN-0126.params") + download.file( + "http://data.mxnet.io/mxnet/models/imagenet/inception-bn/Inception-BN-0126.params?raw=true", + destfile = "model/Inception-BN-0126.params") } if (!file.exists("model/Inception-BN-symbol.json")) { - download.file("http://data.dmlc.ml/models/imagenet/inception-bn/Inception-BN-symbol.json", - destfile = "model/Inception-BN-symbol.json") + download.file( + "http://data.mxnet.io/mxnet/models/imagenet/inception-bn/Inception-BN-symbol.json", + destfile = "model/Inception-BN-symbol.json") } } diff --git a/R-package/vignettes/CatsDogsFinetune.Rmd b/R-package/vignettes/CatsDogsFinetune.Rmd index 680b5a302498..726bb1a43c77 100644 --- a/R-package/vignettes/CatsDogsFinetune.Rmd +++ b/R-package/vignettes/CatsDogsFinetune.Rmd @@ -162,13 +162,13 @@ val <- data$val ## Load pretrained model -Here we use the pretrained model from http://data.dmlc.ml/models/imagenet/. +Here we use the pretrained model from http://data.mxnet.io/mxnet/data/. There are 1000 classes in imagenet, and we need to replace the last fully connected layer with a new layer for 2 classes. ```{r} -download.file('http://data.dmlc.ml/data/Inception.zip', destfile = 'Inception.zip') +download.file('http://data.mxnet.io/mxnet/data/Inception.zip', destfile = 'Inception.zip') unzip("Inception.zip") inception_bn <- mx.model.load("./Inception-BN", iteration = 126) diff --git a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd index ff631e0f5ce9..9cfdd5a5473f 100644 --- a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd +++ b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd @@ -31,7 +31,7 @@ Make sure you unzip the pre-trained model in current folder. And we can use the loading function to load the model into R. 
```{r} -download.file('http://data.dmlc.ml/data/Inception.zip', destfile = 'Inception.zip') +download.file('http://data.mxnet.io/mxnet/data/Inception.zip', destfile = 'Inception.zip') unzip("Inception.zip") model <- mx.model.load("Inception/Inception_BN", iteration = 39) ``` diff --git a/README.md b/README.md index 6a8ecdd99bd0..cab2c6deebc7 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ At its core, MXNet contains a dynamic dependency scheduler that automatically pa A graph optimization layer on top of that makes symbolic execution fast and memory efficient. MXNet is portable and lightweight, scaling effectively to multiple GPUs and multiple machines. -MXNet is also more than a deep learning project. It is also a collection of +MXNet is more than a deep learning project. It is a collection of [blue prints and guidelines](https://mxnet.incubator.apache.org/architecture/index.html#deep-learning-system-design-concepts) for building deep learning systems, and interesting insights of DL systems for hackers. @@ -50,6 +50,7 @@ How to Contribute What's New ---------- +* [Version 1.4.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.4.0) - MXNet 1.4.0 Release. * [Version 1.3.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.3.1) - MXNet 1.3.1 Patch Release. * [Version 1.3.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.3.0) - MXNet 1.3.0 Release. * [Version 1.2.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.2.0) - MXNet 1.2.0 Release. @@ -64,8 +65,8 @@ What's New * [Version 0.9.1 Release (NNVM refactor)](./docs/architecture/release_note_0_9.md) - NNVM branch is merged into master now. An official release will be made soon. 
* [Version 0.8.0 Release](https://github.com/dmlc/mxnet/releases/tag/v0.8.0) * [Updated Image Classification with new Pre-trained Models](./example/image-classification) -* [Notebooks How to Use MXNet](https://github.com/zackchase/mxnet-the-straight-dope) -* [MKLDNN for Faster CPU Performance](./MKLDNN_README.md) +* [Notebooks How to Use MXNet](https://github.com/d2l-ai/d2l-en) +* [MKLDNN for Faster CPU Performance](./docs/tutorials/mkldnn/MKLDNN_README.md) * [MXNet Memory Monger, Training Deeper Nets with Sublinear Memory Cost](https://github.com/dmlc/mxnet-memonger) * [Tutorial for NVidia GTC 2016](https://github.com/dmlc/mxnet-gtc-tutorial) * [Embedding Torch layers and functions in MXNet](https://mxnet.incubator.apache.org/faq/torch.html) diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py index 0a4be02b8ff9..e47ab6b0e22e 100644 --- a/amalgamation/amalgamation.py +++ b/amalgamation/amalgamation.py @@ -46,6 +46,14 @@ if platform.system() != 'Windows': blacklist.append('windows.h') blacklist.append('process.h') + blacklist.append('Shlwapi.h') + +if platform.system() == 'Windows': + blacklist.append('unistd.h') + +if 'freebsd' not in sys.platform: + blacklist.append('sys/endian.h') + def get_sources(def_file): @@ -94,6 +102,7 @@ def find_source(name, start, stage): re1 = re.compile('<([./a-zA-Z0-9_-]*)>') re2 = re.compile('"([./a-zA-Z0-9_-]*)"') +re3 = re.compile('DMLC_EXECINFO_H') sysheaders = [] history = set([]) @@ -129,6 +138,9 @@ def expand(x, pending, stage): with open(x, 'rb') as x_h: for line in x_h.readlines(): uline = line.decode('utf-8') + if '#define DMLC_LOG_STACK_TRACE 1' in uline.strip(): + # Do not enable stacktrace logging + continue if uline.find('#include') < 0: out.write(line) continue @@ -138,10 +150,15 @@ def expand(x, pending, stage): m = re1.search(uline) if not m: m = re2.search(uline) - if not m: - print(uline + ' not found') - continue - path = m.groups()[0] + if m: + path = m.groups()[0] + else: + m = 
re3.search(uline) + if m: + path = 'execinfo.h' + else: + print(uline + ' not found') + continue h = path.strip('./') if "../3rdparty/" not in path else path if h.endswith('complex.h') and x.endswith('openblas_config.h'): source = '' diff --git a/benchmark/python/control_flow/rnn.py b/benchmark/python/control_flow/rnn.py index 08498724b1b4..24e326c9afd1 100644 --- a/benchmark/python/control_flow/rnn.py +++ b/benchmark/python/control_flow/rnn.py @@ -79,12 +79,7 @@ def _array(shape, ctx): def _get_gpus(): - try: - re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) - except OSError: - return [] - return range(len([i for i in re.split('\n') if 'GPU' in i])) - + return range(mx.util.get_gpu_count()) def run_benchmark(cell_type, ctx, seq_len, batch_size, hidden_dim): obj = {"foreach": ForeachRNN, "while_loop": WhileRNN}[args.benchmark] diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 054deb5f87d2..38cc0b927c43 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -186,18 +186,27 @@ def update_github_commit_status(state, message) { context = get_github_context() echo "context=${context}" - step([ - $class: 'GitHubCommitStatusSetter', - reposSource: [$class: "ManuallyEnteredRepositorySource", url: repoUrl], - contextSource: [$class: "ManuallyEnteredCommitContextSource", context: context], - commitShaSource: [$class: "ManuallyEnteredShaSource", sha: commitSha], - statusBackrefSource: [$class: "ManuallyEnteredBackrefSource", backref: "${env.RUN_DISPLAY_URL}"], - errorHandlers: [[$class: 'ShallowAnyErrorHandler']], - statusResultSource: [ - $class: 'ConditionalStatusResultSource', - results: [[$class: "AnyBuildResult", message: message, state: state]] - ] - ]) + // a few attempts need to be made: https://github.com/apache/incubator-mxnet/issues/11654 + for (int attempt = 1; attempt <= 3; attempt++) { + echo "Sending GitHub status attempt ${attempt}..." 
+ + step([ + $class: 'GitHubCommitStatusSetter', + reposSource: [$class: "ManuallyEnteredRepositorySource", url: repoUrl], + contextSource: [$class: "ManuallyEnteredCommitContextSource", context: context], + commitShaSource: [$class: "ManuallyEnteredShaSource", sha: commitSha], + statusBackrefSource: [$class: "ManuallyEnteredBackrefSource", backref: "${env.RUN_DISPLAY_URL}"], + errorHandlers: [[$class: 'ShallowAnyErrorHandler']], + statusResultSource: [ + $class: 'ConditionalStatusResultSource', + results: [[$class: "AnyBuildResult", message: message, state: state]] + ] + ]) + + if (attempt <= 2) { + sleep 1 + } + } echo "Publishing commit status done." diff --git a/ci/build_windows.py b/ci/build_windows.py index b7d47fb1fde1..e8658995b68e 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -213,11 +213,11 @@ def main(): if system == 'Windows': logging.info("Detected Windows platform") if 'OpenBLAS_HOME' not in os.environ: - os.environ["OpenBLAS_HOME"] = "C:\\mxnet\\openblas" + os.environ["OpenBLAS_HOME"] = "C:\\Program Files\\OpenBLAS-v0.2.19" if 'OpenCV_DIR' not in os.environ: - os.environ["OpenCV_DIR"] = "C:\\mxnet\\opencv_vc14" + os.environ["OpenCV_DIR"] = "C:\\Program Files\\OpenCV-v3.4.1\\build" if 'CUDA_PATH' not in os.environ: - os.environ["CUDA_PATH"] = "C:\\CUDA\\v8.0" + os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2" windows_build(args) elif system == 'Linux' or system == 'Darwin': diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh index 61a4637830da..3cb806e0aadd 100755 --- a/ci/docker/install/ubuntu_core.sh +++ b/ci/docker/install/ubuntu_core.sh @@ -45,6 +45,7 @@ apt-get install -y \ software-properties-common \ sudo \ unzip \ + vim-nox \ wget # Use libturbojpeg package as it is correctly compiled with -fPIC flag diff --git a/ci/docker/install/ubuntu_mklml.sh b/ci/docker/install/ubuntu_mklml.sh index 862e2846403a..e50b6d273b8c 100755 --- a/ci/docker/install/ubuntu_mklml.sh 
+++ b/ci/docker/install/ubuntu_mklml.sh @@ -21,5 +21,5 @@ # the whole docker cache for the image set -ex -wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz +wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.18/mklml_lnx_2019.0.3.20190220.tgz tar -zxf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_* diff --git a/ci/docker/qemu/runtime_functions.py b/ci/docker/qemu/runtime_functions.py index 8b8e5acb503c..5a57cb8dae6a 100755 --- a/ci/docker/qemu/runtime_functions.py +++ b/ci/docker/qemu/runtime_functions.py @@ -77,8 +77,8 @@ def run_ut_python3_qemu_internal(): logging.info("=== NOW Running inside QEMU ===") logging.info("PIP Installing %s", pkg) check_call(['sudo', 'pip3', 'install', pkg]) - logging.info("PIP Installing mxnet/tests/requirements.txt") - check_call(['sudo', 'pip3', 'install', '-r', 'mxnet/tests/requirements.txt']) + logging.info("PIP Installing mxnet/test_requirements.txt") + check_call(['sudo', 'pip3', 'install', '-r', 'mxnet/test_requirements.txt']) logging.info("Running tests in mxnet/tests/python/unittest/") check_call(['nosetests', '--with-timer', '--with-xunit', '--xunit-file', 'nosetests_unittest.xml', '--verbose', 'mxnet/tests/python/unittest/test_engine.py']) # Example to run a single unit test: diff --git a/ci/docker/qemu/vmcontrol.py b/ci/docker/qemu/vmcontrol.py index d80e22b1db85..31ef4d2550c3 100644 --- a/ci/docker/qemu/vmcontrol.py +++ b/ci/docker/qemu/vmcontrol.py @@ -229,6 +229,7 @@ def qemu_provision(ssh_port=QEMU_SSH_PORT): qemu_rsync(ssh_port, '/work/runtime_functions.py','') qemu_rsync(ssh_port, '/work/vmcontrol.py','') qemu_rsync(ssh_port, 'mxnet/tests', 'mxnet') + qemu_rsync(ssh_port, 'mxnet/ci/qemu/test_requirements.txt', 'mxnet/test_requirements.txt') logging.info("Provisioning completed successfully.") diff --git a/ci/docker/runtime_functions.sh 
b/ci/docker/runtime_functions.sh index de1b7795ce69..a89c51de0d8e 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -803,7 +803,7 @@ unittest_ubuntu_python2_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export CUDNN_VERSION=7.0.3 + export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu } @@ -812,7 +812,7 @@ unittest_ubuntu_python3_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export CUDNN_VERSION=7.0.3 + export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu } @@ -829,7 +829,7 @@ unittest_ubuntu_tensorrt_gpu() { export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH - export CUDNN_VERSION=7.0.3 + export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} python tests/python/tensorrt/lenet5_train.py nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/tensorrt/ } @@ -841,7 +841,7 @@ unittest_ubuntu_python2_quantization_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export CUDNN_VERSION=7.0.3 + export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu } @@ -852,7 +852,7 @@ unittest_ubuntu_python3_quantization_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - 
export CUDNN_VERSION=7.0.3 + export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu } @@ -1003,7 +1003,7 @@ unittest_centos7_cpu() { unittest_centos7_gpu() { set -ex cd /work/mxnet - export CUDNN_VERSION=7.0.3 + export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} python3.6 -m "nose" $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu } diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index cfbf484756e5..5b9ad47f6afb 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -33,7 +33,7 @@ mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/li // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, lib/libsparse_matrix.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' mx_tensorrt_lib = 'build/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, 
build/cpp-package/example/*' mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*' @@ -476,13 +476,11 @@ def compile_unix_amalgamation() { def compile_windows_cpu() { return ['Build CPU windows':{ node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/build-cpu') { - withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { - utils.init_git_win() - powershell 'python ci/build_windows.py -f WIN_CPU' - stash includes: 'windows_package.7z', name: 'windows_package_cpu' - } + ws('workspace/build-cpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git_win() + powershell 'py -3 ci/build_windows.py -f WIN_CPU' + stash includes: 'windows_package.7z', name: 'windows_package_cpu' } } } @@ -492,13 +490,11 @@ def compile_windows_cpu() { def compile_windows_gpu() { return ['Build GPU windows':{ node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/build-gpu') { - withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { + ws('workspace/build-gpu') { + timeout(time: max_time, unit: 'MINUTES') { utils.init_git_win() - powershell 'python ci/build_windows.py -f WIN_GPU' + powershell 'py -3 ci/build_windows.py -f WIN_GPU' stash includes: 'windows_package.7z', name: 'windows_package_gpu' - } } } } @@ -508,13 +504,11 @@ def compile_windows_gpu() { def compile_windows_gpu_mkldnn() { return ['Build GPU MKLDNN windows':{ node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/build-gpu') { - withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0','BUILD_NAME=vc14_gpu_mkldnn']) { - utils.init_git_win() - powershell 'python ci/build_windows.py -f WIN_GPU_MKLDNN' - stash includes: 'windows_package.7z', name: 'windows_package_gpu_mkldnn' - } + ws('workspace/build-gpu') { + timeout(time: max_time, unit: 
'MINUTES') { + utils.init_git_win() + powershell 'py -3 ci/build_windows.py -f WIN_GPU_MKLDNN' + stash includes: 'windows_package.7z', name: 'windows_package_gpu_mkldnn' } } } diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 664e591abbd8..f6191deb7a68 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -50,7 +50,8 @@ core_logic: { custom_steps.test_unix_python2_mkldnn_gpu(), custom_steps.test_unix_python3_mkldnn_gpu(), custom_steps.test_unix_python3_mkldnn_nocudnn_gpu(), - custom_steps.test_unix_python3_tensorrt_gpu(), +// Disabled temporarily for https://github.com/apache/incubator-mxnet/issues/14626 +// custom_steps.test_unix_python3_tensorrt_gpu(), custom_steps.test_unix_perl_gpu(), custom_steps.test_unix_r_gpu(), custom_steps.test_unix_cpp_gpu(), diff --git a/ci/jenkins/Jenkinsfile_windows_cpu b/ci/jenkins/Jenkinsfile_windows_cpu index 01ca673433f6..5bc40d625930 100644 --- a/ci/jenkins/Jenkinsfile_windows_cpu +++ b/ci/jenkins/Jenkinsfile_windows_cpu @@ -34,14 +34,14 @@ utils.assign_node_labels(utility: 'utility', windows_cpu: 'mxnetwindows-cpu') utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - // custom_steps.compile_windows_cpu() + custom_steps.compile_windows_cpu() ]) utils.parallel_stage('Tests', [ - // custom_steps.test_windows_python2_cpu(), - // custom_steps.test_windows_python3_cpu(), - // custom_steps.test_windows_julia07_cpu(), - // custom_steps.test_windows_julia10_cpu() + custom_steps.test_windows_python2_cpu(), + custom_steps.test_windows_python3_cpu(), + custom_steps.test_windows_julia07_cpu(), + custom_steps.test_windows_julia10_cpu() ]) } , diff --git a/ci/jenkins/Jenkinsfile_windows_gpu b/ci/jenkins/Jenkinsfile_windows_gpu index b3447b960b22..2319f25942de 100644 --- a/ci/jenkins/Jenkinsfile_windows_gpu +++ b/ci/jenkins/Jenkinsfile_windows_gpu @@ -34,14 +34,14 @@ utils.assign_node_labels(utility: 'utility', windows_cpu: 'mxnetwindows-cpu', wi 
utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - // custom_steps.compile_windows_gpu(), - // custom_steps.compile_windows_gpu_mkldnn() + custom_steps.compile_windows_gpu(), + custom_steps.compile_windows_gpu_mkldnn() ]) utils.parallel_stage('Tests', [ - // custom_steps.test_windows_python2_gpu(), - // custom_steps.test_windows_python3_gpu(), - // custom_steps.test_windows_python3_gpu_mkldnn() + custom_steps.test_windows_python2_gpu(), + custom_steps.test_windows_python3_gpu(), + custom_steps.test_windows_python3_gpu_mkldnn() ]) } , diff --git a/ci/qemu/README.md b/ci/qemu/README.md index 498f8b7a8739..4beca4a03690 100644 --- a/ci/qemu/README.md +++ b/ci/qemu/README.md @@ -86,3 +86,7 @@ pip3 install -r mxnet_requirements.txt To access qemu control console from tmux: `ctrl-a a c` + +# CI and Testing + +Formerly, [runtime_functions.py](https://github.com/apache/incubator-mxnet/blob/master/ci/docker/qemu/runtime_functions.py) would [run](https://github.com/apache/incubator-mxnet/blob/8beea18e3d9835f90b59d3f9de8f9945ac819423/ci/docker/qemu/runtime_functions.py#L81) *pip install -r [mxnet/tests/requirements.txt](https://github.com/apache/incubator-mxnet/blob/master/tests/requirements.txt)*. If the requirements change, there can be an unfortunate side-effect that there are no wheel files for Raspberry Pi for the new requirement. This would trigger a build from source on the emulator, which can take a long time and cause job timeouts. Therefore, we no longer install the `tests/requirements.txt` requirements, but rather rely on [test_requirements.txt](https://github.com/apache/incubator-mxnet/blob/master/ci/qemu/test_requirements.txt) to maintain the requirements for the qemu tests. Should any requirements changes lead to a job time out, it is incumbent on the submitter to update the image to include the requirement and unblock ci. 
diff --git a/ci/qemu/test_requirements.txt b/ci/qemu/test_requirements.txt new file mode 100644 index 000000000000..77037d89c673 --- /dev/null +++ b/ci/qemu/test_requirements.txt @@ -0,0 +1,3 @@ +mock +nose +nose-timer \ No newline at end of file diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1 index 46e49baeadbb..1c4a72682ae5 100644 --- a/ci/windows/test_py2_cpu.ps1 +++ b/ci/windows/test_py2_cpu.ps1 @@ -22,8 +22,8 @@ $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') -c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt -c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +C:\Python27\Scripts\pip install -r tests\requirements.txt +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest if (! $?) { Throw ("Error running unittest") } -c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train if (! $?) 
{ Throw ("Error running train tests") } diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 index d362c61da02b..8a6c8e9b44f9 100644 --- a/ci/windows/test_py2_gpu.ps1 +++ b/ci/windows/test_py2_gpu.ps1 @@ -22,12 +22,12 @@ $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') -c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt -c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +C:\Python27\Scripts\pip install -r tests\requirements.txt +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest if (! $?) { Throw ("Error running unittest") } -c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py if (! $?) { Throw ("Error running tests") } -c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py if (! $?) 
{ Throw ("Error running tests") } -c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error tests\python\train +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error tests\python\train if (! $?) { Throw ("Error running tests") } diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 index 32da4885fe0a..a7067f9f3f83 100644 --- a/ci/windows/test_py3_cpu.ps1 +++ b/ci/windows/test_py3_cpu.ps1 @@ -22,8 +22,8 @@ $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') -c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt -c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +C:\Python37\Scripts\pip install -r tests\requirements.txt +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest if (! $?) { Throw ("Error running unittest") } -c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train if (! $?) 
{ Throw ("Error running train tests") } diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 index b30b22ae90e4..5fbc9f2f8036 100644 --- a/ci/windows/test_py3_gpu.ps1 +++ b/ci/windows/test_py3_gpu.ps1 @@ -22,12 +22,12 @@ $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') -c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt -c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +C:\Python37\Scripts\pip install -r tests\requirements.txt +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest if (! $?) { Throw ("Error running unittest") } -c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py if (! $?) { Throw ("Error running tests") } -c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py if (! $?) 
{ Throw ("Error running tests") } -c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train if (! $?) { Throw ("Error running tests") } diff --git a/cmake/DownloadMKLML.cmake b/cmake/DownloadMKLML.cmake index eabf861a4b2a..7b0e5ecf7c9c 100644 --- a/cmake/DownloadMKLML.cmake +++ b/cmake/DownloadMKLML.cmake @@ -19,15 +19,19 @@ message(STATUS "Downloading MKLML...") -set(MKLDNN_RELEASE v0.17-rc) -set(MKLML_RELEASE_FILE_SUFFIX 2019.0.1.20180928) +set(MKLDNN_RELEASE v0.18) +set(MKLML_RELEASE_FILE_SUFFIX 2019.0.3.20190220) + +set(MKLML_LNX_MD5 76354b74325cd293aba593d7cbe36b3f) +set(MKLML_WIN_MD5 02286cb980f12af610c05e99dbd78755) +set(MKLML_MAC_MD5 3b28da686a25a4cf995ca4fc5e30e514) if(MSVC) set(MKL_NAME "mklml_win_${MKLML_RELEASE_FILE_SUFFIX}") file(DOWNLOAD "https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_RELEASE}/${MKL_NAME}.zip" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.zip" - EXPECTED_MD5 "443e661bdfd32dbbc99b460b43afceee" SHOW_PROGRESS) + EXPECTED_MD5 "${MKLML_WIN_MD5}" SHOW_PROGRESS) file(DOWNLOAD "https://github.com/apache/incubator-mxnet/releases/download/utils/7z.exe" "${CMAKE_CURRENT_BINARY_DIR}/mklml/7z2.exe" EXPECTED_MD5 "E1CF766CF358F368EC97662D06EA5A4C" SHOW_PROGRESS) @@ -47,7 +51,7 @@ elseif(APPLE) file(DOWNLOAD "https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_RELEASE}/${MKL_NAME}.tgz" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz" - EXPECTED_MD5 "95f887af332205b1d15b392260003952" SHOW_PROGRESS) + EXPECTED_MD5 "${MKLML_MAC_MD5}" SHOW_PROGRESS) execute_process(COMMAND "tar" "-xzf" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz" "-C" "${CMAKE_CURRENT_BINARY_DIR}/mklml/") @@ -61,7 +65,7 @@ elseif(UNIX) file(DOWNLOAD 
"https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_RELEASE}/${MKL_NAME}.tgz" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz" - EXPECTED_MD5 "a63abf155361322b9c03f8fc50f4f317" SHOW_PROGRESS) + EXPECTED_MD5 "${MKLML_LNX_MD5}" SHOW_PROGRESS) execute_process(COMMAND "tar" "-xzf" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz" "-C" "${CMAKE_CURRENT_BINARY_DIR}/mklml/") diff --git a/contrib/clojure-package/.gitignore b/contrib/clojure-package/.gitignore index f5d81ddc7620..71d812e56ecd 100644 --- a/contrib/clojure-package/.gitignore +++ b/contrib/clojure-package/.gitignore @@ -39,6 +39,8 @@ examples/visualization/test-vis.pdf src/.DS_Store src/org/.DS_Store test/test-ndarray.clj +test/test-ndarray-api.clj test/test-symbol.clj +test/test-symbol-api.clj src/org/apache/clojure_mxnet/gen/* diff --git a/contrib/clojure-package/examples/bert-qa/.gitignore b/contrib/clojure-package/examples/bert-qa/.gitignore new file mode 100644 index 000000000000..d18f225992a9 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/.gitignore @@ -0,0 +1,12 @@ +/target +/classes +/checkouts +profiles.clj +pom.xml +pom.xml.asc +*.jar +*.class +/.lein-* +/.nrepl-port +.hgignore +.hg/ diff --git a/contrib/clojure-package/examples/bert-qa/README.md b/contrib/clojure-package/examples/bert-qa/README.md new file mode 100644 index 000000000000..9a21bcdfd66b --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/README.md @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + + + + +# bert-qa + +**This example was based off of the Java API one. It shows how to do inference with a pre-trained BERT network that is trained on Questions and Answers using the [SQuAD Dataset](https://rajpurkar.github.io/SQuAD-explorer/)** + +The pretrained model was created using GluonNLP and then exported to the MXNet symbol format. You can find more information in the background section below. + +In this tutorial, we will walk through the BERT QA model trained by MXNet. 
+Users can provide a question with a paragraph containing the answer to the model and +the model will be able to find the best answer from the answer paragraph. + +Example: + +``` +{:input-answer "Steam engines are external combustion engines, where the working fluid is separate from the combustion products. Non-combustion heat sources such as solar power, nuclear power or geothermal energy may be used. The ideal thermodynamic cycle used to analyze this process is called the Rankine cycle. In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure. When expanded through pistons or turbines, mechanical work is done. The reduced-pressure steam is then condensed and pumped back into the boiler." + :input-question "Along with geothermal and nuclear, what is a notable non-combustion heat source?" + :ground-truth-answers ["solar" + "solar power" + "solar power, nuclear power or geothermal energysolar"]} +``` + +The prediction in this case would be `solar power` + +## Setup Guide + +### Step 1: Download the model + +For this tutorial, you can get the model and vocabulary by running the following bash file. This script will use `curl` to download these artifacts from AWS S3. + +From the example directory: + +```bash +./get_bert_data.sh +``` + +Some sample questions and answers are provided in the `squad-samples.edn` file. Some are taken directly from the SQuAD dataset and one was just made up. Feel free to edit the file and add your own! + + +## To run + +* `lein install` in the root of the main project directory +* cd into this project directory and do `lein run`. This will execute the cpu version. + +`lein run :cpu` - to run with cpu +`lein run :gpu` - to run with gpu + +## Background + +To learn more about how BERT works in MXNet, please follow this [MXNet Gluon tutorial on NLP using BERT](https://medium.com/apache-mxnet/gluon-nlp-bert-6a489bdd3340). + +The model was extracted from MXNet GluonNLP with static length settings. 
+ +[Download link for the script](https://gluon-nlp.mxnet.io/_downloads/bert.zip) + +The original description can be found in the [MXNet GluonNLP model zoo](https://gluon-nlp.mxnet.io/model_zoo/bert/index.html#bert-base-on-squad-1-1). +```bash +python static_finetune_squad.py --optimizer adam --accumulate 2 --batch_size 6 --lr 3e-5 --epochs 2 --gpu 0 --export + +``` +This script will generate `json` and `param` files that are the standard MXNet model files. +By default, this model is using `bert_12_768_12` model with extra layers for QA jobs. + +After that, to be able to use it in Java, we need to export the dictionary from the script to parse the text +to actual indexes. Please add the following lines after [this line](https://github.com/dmlc/gluon-nlp/blob/master/scripts/bert/staticbert/static_finetune_squad.py#L262). +```python +import json +json_str = vocab.to_json() +f = open("vocab.json", "w") +f.write(json_str) +f.close() +``` +This would export the token vocabulary in json format. +Once you have these three files, you will be able to run this example without problems. + diff --git a/contrib/clojure-package/examples/bert-qa/get_bert_data.sh b/contrib/clojure-package/examples/bert-qa/get_bert_data.sh new file mode 100755 index 000000000000..603194a03c05 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/get_bert_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +data_path=model + +if [ ! -d "$data_path" ]; then + mkdir -p "$data_path" + curl https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/BertQA/vocab.json -o $data_path/vocab.json + curl https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/BertQA/static_bert_qa-0002.params -o $data_path/static_bert_qa-0002.params + curl https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/BertQA/static_bert_qa-symbol.json -o $data_path/static_bert_qa-symbol.json +fi diff --git a/contrib/clojure-package/examples/bert-qa/project.clj b/contrib/clojure-package/examples/bert-qa/project.clj new file mode 100644 index 000000000000..d256d44d0798 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/project.clj @@ -0,0 +1,28 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + + +(defproject bert-qa "0.1.0-SNAPSHOT" + :description "BERT QA Example" + :plugins [[lein-cljfmt "0.5.7"]] + :dependencies [[org.clojure/clojure "1.9.0"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] + [cheshire "5.8.1"]] + :pedantic? :skip + :java-source-paths ["src/java"] + :main bert-qa.infer + :repl-options {:init-ns bert-qa.infer}) diff --git a/contrib/clojure-package/examples/bert-qa/squad-samples.edn b/contrib/clojure-package/examples/bert-qa/squad-samples.edn new file mode 100644 index 000000000000..e99a181f7d17 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/squad-samples.edn @@ -0,0 +1,39 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + +[{:input-answer "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm." 
+ :input-question "By what main attribute are computational problems classified utilizing computational complexity theory?" + :ground-truth-answers ["Computational complexity theory" + "Computational complexity theory" + "complexity theory"]} + {:input-answer "Steam engines are external combustion engines, where the working fluid is separate from the combustion products. Non-combustion heat sources such as solar power, nuclear power or geothermal energy may be used. The ideal thermodynamic cycle used to analyze this process is called the Rankine cycle. In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure. When expanded through pistons or turbines, mechanical work is done. The reduced-pressure steam is then condensed and pumped back into the boiler." + :input-question "Along with geothermal and nuclear, what is a notable non-combustion heat source?" + :ground-truth-answers ["solar" + "solar power" + "solar power, nuclear power or geothermal energysolar"]} + {:input-answer "In the 1960s, a series of discoveries, the most important of which was seafloor spreading, showed that the Earth's lithosphere, which includes the crust and rigid uppermost portion of the upper mantle, is separated into a number of tectonic plates that move across the plastically deforming, solid, upper mantle, which is called the asthenosphere. There is an intimate coupling between the movement of the plates on the surface and the convection of the mantle: oceanic plate motions and mantle convection currents always move in the same direction, because the oceanic lithosphere is the rigid upper thermal boundary layer of the convecting mantle. This coupling between rigid plates moving on the surface of the Earth and the convecting mantle is called plate tectonics." + :input-question "What was the most important discovery that led to the understanding that Earth's lithosphere is separated into tectonic plates?" 
+ :ground-truth-answers ["seafloor spreading"]} + ;;; totally made up + {:input-answer "Susan had a cat named Sammy when she lived in the green house." + :input-question "What was Susan's cat named?" + :ground-truth-answers ["Sammy" "sammy"]} + ;;; more or less from wikipedia on clojure + {:input-answer "Rich Hickey is the creator of the Clojure language. Before Clojure, he developed dotLisp, a similar project based on the .NET platform, and three earlier attempts to provide interoperability between Lisp and Java: a Java foreign language interface for Common Lisp, A Foreign Object Interface for Lisp, and a Lisp-friendly interface to Java Servlets." + :input-question "Who created Clojure?" + :ground-truth-answers ["rich" "hickey"]}] diff --git a/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj b/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj new file mode 100644 index 000000000000..836684e04977 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj @@ -0,0 +1,159 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + + +(ns bert-qa.infer + (:require [clojure.string :as string] + [clojure.reflect :as r] + [cheshire.core :as json] + [clojure.java.io :as io] + [clojure.set :as set] + [org.apache.clojure-mxnet.dtype :as dtype] + [org.apache.clojure-mxnet.context :as context] + [org.apache.clojure-mxnet.layout :as layout] + [org.apache.clojure-mxnet.ndarray :as ndarray] + [org.apache.clojure-mxnet.infer :as infer] + [clojure.pprint :as pprint])) + +(def model-path-prefix "model/static_bert_qa") +;; epoch number of the model +(def epoch 2) +;; the vocabulary used in the model +(def model-vocab "model/vocab.json") +;; the input question +;; the maximum length of the sequence +(def seq-length 384) + +;;; data helpers + +(defn break-out-punctuation [s str-match] + (->> (string/split (str s "") (re-pattern (str "\\" str-match))) + (map #(string/replace % "" str-match)))) + +(defn break-out-punctuations [s] + (if-let [target-char (first (re-seq #"[.,?!]" s))] + (break-out-punctuation s target-char) + [s])) + +(defn tokenize [s] + (->> (string/split s #"\s+") + (mapcat break-out-punctuations) + (into []))) + +(defn pad [tokens pad-item num] + (if (>= (count tokens) num) + tokens + (into tokens (repeat (- num (count tokens)) pad-item)))) + +(defn get-vocab [] + (let [vocab (json/parse-stream (clojure.java.io/reader "model/vocab.json"))] + {:idx->token (get vocab "idx_to_token") + :token->idx (get vocab "token_to_idx")})) + +(defn tokens->idxs [token->idx tokens] + (let [unk-idx (get token->idx "[UNK]")] + (mapv #(get token->idx % unk-idx) tokens))) + +(defn idxs->tokens [idx->token idxs] + (mapv #(get idx->token %) idxs)) + +(defn post-processing [result tokens] + (let [output1 (ndarray/slice-axis result 2 0 1) + output2 (ndarray/slice-axis result 2 1 2) + ;;; get the formatted logits result + start-logits (ndarray/reshape output1 [0 -3]) + end-logits (ndarray/reshape output2 [0 -3]) + start-prob (ndarray/softmax start-logits) + end-prob (ndarray/softmax end-logits) + start-idx (-> 
(ndarray/argmax start-prob 1) + (ndarray/->vec) + (first)) + end-idx (-> (ndarray/argmax end-prob 1) + (ndarray/->vec) + (first))] + (if (> end-idx start-idx) + (subvec tokens start-idx (inc end-idx)) + (subvec tokens end-idx (inc end-idx))))) + +(defn make-predictor [ctx] + (let [input-descs [{:name "data0" + :shape [1 seq-length] + :dtype dtype/FLOAT32 + :layout layout/NT} + {:name "data1" + :shape [1 seq-length] + :dtype dtype/FLOAT32 + :layout layout/NT} + {:name "data2" + :shape [1] + :dtype dtype/FLOAT32 + :layout layout/N}] + factory (infer/model-factory model-path-prefix input-descs)] + (infer/create-predictor + factory + {:contexts [ctx] + :epoch 2}))) + +(defn pre-processing [ctx idx->token token->idx qa-map] + (let [{:keys [input-question input-answer ground-truth-answers]} qa-map + ;;; pre-processing tokenize sentence + token-q (tokenize (string/lower-case input-question)) + token-a (tokenize (string/lower-case input-answer)) + valid-length (+ (count token-q) (count token-a)) + ;;; generate token types [0000...1111...0000] + qa-embedded (into (pad [] 0 (count token-q)) + (pad [] 1 (count token-a))) + token-types (pad qa-embedded 0 seq-length) + ;;; make BERT pre-processing standard + token-a (conj token-a "[SEP]") + token-q (into [] (concat ["[CLS]"] token-q ["[SEP]"] token-a)) + tokens (pad token-q "[PAD]" seq-length) + ;;; pre-processing - token to index translation + + indexes (tokens->idxs token->idx tokens)] + {:input-batch [(ndarray/array indexes [1 seq-length] {:context ctx}) + (ndarray/array token-types [1 seq-length] {:context ctx}) + (ndarray/array [valid-length] [1] {:context ctx})] + :tokens tokens + :qa-map qa-map})) + +(defn infer [ctx] + (let [ctx (context/default-context) + predictor (make-predictor ctx) + {:keys [idx->token token->idx]} (get-vocab) + ;;; samples taken from https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/ + question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))] + (doseq [qa-map 
question-answers] + (let [{:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map) + result (first (infer/predict-with-ndarray predictor input-batch)) + answer (post-processing result tokens)] + (println "===============================") + (println " Question Answer Data") + (pprint/pprint qa-map) + (println) + (println " Predicted Answer: " answer) + (println "==============================="))))) + +(defn -main [& args] + (let [[dev] args] + (if (= dev ":gpu") + (infer (context/gpu)) + (infer (context/cpu))))) + +(comment + + (infer :cpu)) diff --git a/contrib/clojure-package/examples/bert-qa/test/bert_qa/infer_test.clj b/contrib/clojure-package/examples/bert-qa/test/bert_qa/infer_test.clj new file mode 100644 index 000000000000..767fb089f284 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/test/bert_qa/infer_test.clj @@ -0,0 +1,42 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + + +(ns bert-qa.infer-test + (:require [bert-qa.infer :refer :all] + [clojure.java.io :as io] + [clojure.java.shell :refer [sh]] + [clojure.test :refer :all] + [org.apache.clojure-mxnet.context :as context] + [org.apache.clojure-mxnet.infer :as infer])) + +(def model-dir "model/") + +(when-not (.exists (io/file (str model-dir "static_bert_qa-0002.params"))) + (println "Downloading bert qa data") + (sh "./get_bert_data.sh")) + +(deftest infer-test + (let [ctx (context/default-context) + predictor (make-predictor ctx) + {:keys [idx->token token->idx]} (get-vocab) + ;;; samples taken from https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/ + question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))] + (let [qa-map (last question-answers) + {:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map) + result (first (infer/predict-with-ndarray predictor input-batch))] + (is (= ["rich" "hickey"] (post-processing result tokens)))))) diff --git a/contrib/clojure-package/examples/infer/objectdetector/project.clj b/contrib/clojure-package/examples/infer/objectdetector/project.clj index cdd9a8991dc8..da01797f5a21 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/project.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/project.clj @@ -22,7 +22,6 @@ :aliases {"run-detector" ["run" "--" "-m" "models/resnet50_ssd/resnet50_ssd_model" "-i" "images/dog.jpg" "-d" "images/"]} :dependencies [[org.clojure/clojure "1.9.0"] [org.clojure/tools.cli "0.4.1"] - [origami "4.0.0-3"] [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :main ^:skip-aot infer.objectdetector-example :profiles {:uberjar {:aot :all}}) diff --git a/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj b/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj deleted file mode 100644 index d29b34b5c22a..000000000000 --- 
a/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj +++ /dev/null @@ -1,44 +0,0 @@ -;; Licensed to the Apache Software Foundation (ASF) under one or more -;; contributor license agreements. See the NOTICE file distributed with -;; this work for additional information regarding copyright ownership. -;; The ASF licenses this file to You under the Apache License, Version 2.0 -;; (the "License"); you may not use this file except in compliance with -;; the License. You may obtain a copy of the License at -;; -;; http://www.apache.org/licenses/LICENSE-2.0 -;; -;; Unless required by applicable law or agreed to in writing, software -;; distributed under the License is distributed on an "AS IS" BASIS, -;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -;; See the License for the specific language governing permissions and -;; limitations under the License. -;; - -(ns infer.draw - (:require - [opencv4.colors.rgb :as rgb] - [opencv4.core :refer [FONT_HERSHEY_PLAIN imread imwrite new-point put-text! rectangle]])) - -(defn black-boxes! [img results] - (doseq [{confidence :confidence label :label top-left :top-left bottom-right :bottom-right} results] - (let [w (.width img) - h (.height img) - top-left-p (new-point (int (* w (first top-left))) (int (* h (second top-left)))) - bottom-right-p (new-point (int (* w (first bottom-right))) (int (* h (second bottom-right))))] - (if (< 15 confidence) - (do - (rectangle img top-left-p bottom-right-p rgb/white 1) - (put-text! img - (str label "[" confidence "% ]") - top-left-p - FONT_HERSHEY_PLAIN - 1.0 - rgb/white 1))))) - img) - -(defn draw-bounds [image results output-dir] - (let [out-file (str output-dir "/" (.getName (clojure.java.io/as-file image)))] - (-> image - (imread) - (black-boxes! 
results) - (imwrite out-file)))) \ No newline at end of file diff --git a/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj index 9331798b038c..65d822ff36aa 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj @@ -17,13 +17,15 @@ (ns infer.objectdetector-example (:require [org.apache.clojure-mxnet.context :as context] [org.apache.clojure-mxnet.dtype :as dtype] + [org.apache.clojure-mxnet.image :as image] [org.apache.clojure-mxnet.infer :as infer] [org.apache.clojure-mxnet.layout :as layout] [clojure.java.io :as io] - [infer.draw :as draw] - [clojure.string :refer [join]] + [clojure.string :as string] [clojure.tools.cli :refer [parse-opts]]) - (:gen-class)) + (:gen-class) + (:import (javax.imageio ImageIO) + (java.io File))) (defn check-valid-dir "Check that the input directory exists" @@ -54,27 +56,36 @@ :validate [check-valid-dir "Input directory not found"]] ["-h" "--help"]]) -(defn result->map [{:keys [class prob x-min y-min x-max y-max]}] - (hash-map - :label class - :confidence (int (* 100 prob)) - :top-left [x-min y-min] - :bottom-right [x-max y-max])) -(defn print-results [results] - (doseq [_r results] - (println (format "Class: %s Confidence=%s Coords=(%s, %s)" - (_r :label) - (_r :confidence) - (_r :top-left) - (_r :bottom-right))))) +(defn process-result! 
[output-dir image-path predictions] + (println "looking at image" image-path) + (println "predictions: " predictions) + (let [buf (ImageIO/read (new File image-path)) + width (.getWidth buf) + height (.getHeight buf) + names (mapv :class predictions) + coords (mapv (fn [prediction] + (-> prediction + (update :x-min #(* width %)) + (update :x-max #(* width %)) + (update :y-min #(* height %)) + (update :y-max #(* height %)))) + predictions) + new-img (-> (ImageIO/read (new File image-path)) + (image/draw-bounding-box! coords + {:stroke 2 + :names (mapv #(str (:class %) "-" (:prob %)) + predictions) + :transparency 0.5 + + :font-size-mult 1.0}))] + (->> (string/split image-path #"\/") + last + (io/file output-dir) + (ImageIO/write new-img "jpg")))) (defn process-results [images results output-dir] - (dotimes [i (count images)] - (let [image (nth images i) _results (map result->map (nth results i))] - (println "processing: " image) - (print-results _results) - (draw/draw-bounds image _results output-dir)))) + (doall (map (partial process-result! output-dir) images results))) (defn detect-single-image "Detect objects in a single image and print top-5 predictions" @@ -82,7 +93,7 @@ ([detector input-image output-dir] (.mkdir (io/file output-dir)) (let [image (infer/load-image-from-file input-image) - topk 5 + topk 3 res (infer/detect-objects detector image topk) ] (process-results @@ -109,7 +120,7 @@ (apply concat (for [image-files image-file-batches] (let [image-batch (infer/load-image-paths image-files) - topk 5 + topk 3 res (infer/detect-objects-batch detector image-batch topk) ] (process-results image-files @@ -143,5 +154,5 @@ (parse-opts args cli-options)] (cond (:help options) (println summary) - (some? errors) (println (join "\n" errors)) + (some? 
errors) (println (string/join "\n" errors)) :else (run-detector options)))) diff --git a/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj index 696d96b3ae3a..3d20c614918f 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj @@ -47,11 +47,11 @@ {:keys [class prob x-min x-max y-min y-max] :as pred} (first predictions)] (clojure.pprint/pprint predictions) (is (some? predictions)) - (is (= 5 (count predictions))) + (is (= 3 (count predictions))) (is (string? class)) (is (< 0.8 prob)) (is (every? #(< 0 % 1) [x-min x-max y-min y-max])) - (is (= #{"dog" "person" "bicycle" "car"} (set (mapv :class predictions)))))) + (is (= #{"dog" "bicycle" "car"} (set (mapv :class predictions)))))) (deftest test-batch-detection (let [detector (create-detector) @@ -60,7 +60,7 @@ predictions (first batch-predictions) {:keys [class prob x-min x-max y-min y-max] :as pred} (first predictions)] (is (some? batch-predictions)) - (is (= 5 (count predictions))) + (is (= 3 (count predictions))) (is (string? 
class)) (is (< 0.8 prob)) (println [x-min x-max y-min y-max]) diff --git a/contrib/clojure-package/integration-tests.sh b/contrib/clojure-package/integration-tests.sh index 3f80ea5fb29a..3df9ba9787b5 100755 --- a/contrib/clojure-package/integration-tests.sh +++ b/contrib/clojure-package/integration-tests.sh @@ -26,7 +26,7 @@ lein install # then run through the examples EXAMPLES_HOME=${MXNET_HOME}/contrib/clojure-package/examples # use AWK pattern for blacklisting -TEST_CASES=`find ${EXAMPLES_HOME} -name test | awk '!/dontselect1|cnn-text-classification/'` +TEST_CASES=`find ${EXAMPLES_HOME} -name test | awk '!/dontselect1|cnn-text-classification|gan|neural-style|pre-trained-models/'` for i in $TEST_CASES ; do cd ${i} && lein test done diff --git a/contrib/clojure-package/src/dev/generator.clj b/contrib/clojure-package/src/dev/generator.clj index ca93c3421d2a..34210bef63d0 100644 --- a/contrib/clojure-package/src/dev/generator.clj +++ b/contrib/clojure-package/src/dev/generator.clj @@ -17,10 +17,14 @@ (ns dev.generator (:require [t6.from-scala.core :as scala] + [t6.from-scala.core :refer [$ $$] :as $] [clojure.reflect :as r] - [org.apache.clojure-mxnet.util :as util] - [clojure.pprint]) - (:import (org.apache.mxnet NDArray Symbol)) + [clojure.pprint] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArray NDArrayAPI + Symbol SymbolAPI + Base Base$RefInt Base$RefLong Base$RefFloat Base$RefString) + (scala.collection.mutable ListBuffer ArrayBuffer)) (:gen-class)) @@ -34,17 +38,17 @@ (clojure.string/replace #"\_" "-") (clojure.string/replace #"\/" "div"))) -(defn symbol-transform-param-name [parameter-types] +(defn transform-param-names [coerce-fn parameter-types] (->> parameter-types (map str) - (map (fn [x] (or (util/symbol-param-coerce x) x))) + (map (fn [x] (or (coerce-fn x) x))) (map (fn [x] (last (clojure.string/split x #"\.")))))) +(defn symbol-transform-param-name [parameter-types] + (transform-param-names util/symbol-param-coerce 
parameter-types)) + (defn ndarray-transform-param-name [parameter-types] - (->> parameter-types - (map str) - (map (fn [x] (or (util/ndarray-param-coerce x) x))) - (map (fn [x] (last (clojure.string/split x #"\.")))))) + (transform-param-names util/ndarray-param-coerce parameter-types)) (defn has-variadic? [params] (->> params @@ -56,37 +60,136 @@ (defn increment-param-name [pname] (if-let [num-str (re-find #"-\d" pname)] - (str (first (clojure.string/split pname #"-")) "-" (inc (Integer/parseInt (last (clojure.string/split num-str #"-"))))) + (str + (first (clojure.string/split pname #"-")) + "-" + (inc (Integer/parseInt (last (clojure.string/split num-str #"-"))))) (str pname "-" 1))) -(defn rename-duplicate-params [params] - (reduce (fn [known-names n] (conj known-names (if (contains? (set known-names) n) - (increment-param-name n) - n))) - [] - params)) - +(defn rename-duplicate-params [pnames] + (->> (reduce + (fn [pname-counts n] + (let [rn (if (pname-counts n) (str n "-" (pname-counts n)) n) + inc-pname-counts (update-in pname-counts [n] (fnil inc 0))] + (update-in inc-pname-counts [:params] conj rn))) + {:params []} + pnames) + :params)) + +(defn get-public-no-default-methods [obj] + (->> (r/reflect obj) + :members + (map #(into {} %)) + (filter #(-> % :flags :public)) + (remove #(re-find #"org\$apache\$mxnet" (str (:name %)))) + (remove #(re-find #"\$default" (str (:name %)))))) + +(defn get-public-to-gen-methods [public-to-hand-gen public-no-default] + (let [public-to-hand-gen-names + (into #{} (mapv (comp str :name) public-to-hand-gen))] + (remove #(-> % :name str public-to-hand-gen-names) public-no-default))) -;;;;;;; symbol +(defn public-by-name-and-param-count [public-reflect-info] + (->> public-reflect-info + (group-by :name) + (map (fn [[k v]] [k (group-by #(count (:parameter-types %)) v)])) + (into {}))) -(def symbol-reflect-info (->> (:members (r/reflect Symbol)) - (map #(into {} %)))) +(def license + (str + ";; Licensed to the Apache Software 
Foundation (ASF) under one or more\n" + ";; contributor license agreements. See the NOTICE file distributed with\n" + ";; this work for additional information regarding copyright ownership.\n" + ";; The ASF licenses this file to You under the Apache License, Version 2.0\n" + ";; (the \"License\"); you may not use this file except in compliance with\n" + ";; the License. You may obtain a copy of the License at\n" + ";;\n" + ";; http://www.apache.org/licenses/LICENSE-2.0\n" + ";;\n" + ";; Unless required by applicable law or agreed to in writing, software\n" + ";; distributed under the License is distributed on an \"AS IS\" BASIS,\n" + ";; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + ";; See the License for the specific language governing permissions and\n" + ";; limitations under the License.\n" + ";;\n")) -(def symbol-public (filter (fn [x] (-> x :flags :public)) symbol-reflect-info)) +(defn write-to-file [functions ns-gen fname] + (with-open [w (clojure.java.io/writer fname)] + (.write w ns-gen) + (.write w "\n\n") + (.write w ";; Do not edit - this is auto-generated") + (.write w "\n\n") + (.write w license) + (.write w "\n\n") + (.write w "\n\n") + (doseq [f functions] + (clojure.pprint/pprint f w) + (.write w "\n")))) -(def symbol-public-no-default (->> symbol-public - (filter #(not (re-find #"org\$apache\$mxnet" (str (:name %))))) - (filter #(not (re-find #"\$default" (str (:name %))))))) +;;;;;;; Common operations + +(def libinfo (Base/_LIB)) +(def op-names + (let [l ($ ListBuffer/empty)] + (do (.mxListAllOpNames libinfo l) + (remove #(or (= "Custom" %) + (re-matches #"^_.*" %)) + (util/buffer->vec l))))) + +(defn- parse-arg-type [s] + (let [[_ var-arg-type _ set-arg-type arg-spec _ type-req _ default-val] (re-find #"(([\w-\[\]\s]+)|\{([^}]+)\})\s*(\([^)]+\))?(,\s*(optional|required)(,\s*default=(.*))?)?" s)] + {:type (clojure.string/trim (or set-arg-type var-arg-type)) + :spec arg-spec + :optional? 
(or (= "optional" type-req) + (= "boolean" var-arg-type)) + :default default-val + :orig s})) + +(defn- get-op-handle [op-name] + (let [ref (new Base$RefLong 0)] + (do (.nnGetOpHandle libinfo op-name ref) + (.value ref)))) + +(defn gen-op-info [op-name] + (let [handle (get-op-handle op-name) + name (new Base$RefString nil) + desc (new Base$RefString nil) + key-var-num-args (new Base$RefString nil) + num-args (new Base$RefInt 0) + arg-names ($ ListBuffer/empty) + arg-types ($ ListBuffer/empty) + arg-descs ($ ListBuffer/empty)] + (do (.mxSymbolGetAtomicSymbolInfo libinfo + handle + name + desc + num-args + arg-names + arg-types + arg-descs + key-var-num-args) + {:fn-name (clojure-case (.value name)) + :fn-description (.value desc) + :args (mapv (fn [t n d] (assoc t :name n :description d)) + (mapv parse-arg-type (util/buffer->vec arg-types)) + (mapv clojure-case (util/buffer->vec arg-names)) + (util/buffer->vec arg-descs)) + :key-var-num-args (clojure-case (.value key-var-num-args))}))) + +;;;;;;; Symbol + +(def symbol-public-no-default + (get-public-no-default-methods Symbol)) (into #{} (mapcat :parameter-types symbol-public-no-default)) - ;#{java.lang.Object scala.collection.Seq scala.Option long double scala.collection.immutable.Map int ml.dmlc.mxnet.Executor float ml.dmlc.mxnet.Context java.lang.String scala.Enumeration$Value ml.dmlc.mxnet.Symbol int<> ml.dmlc.mxnet.Symbol<> ml.dmlc.mxnet.Shape java.lang.String<>} +;; #{java.lang.Object scala.collection.Seq scala.Option long double scala.collection.immutable.Map int ml.dmlc.mxnet.Executor float ml.dmlc.mxnet.Context java.lang.String scala.Enumeration$Value ml.dmlc.mxnet.Symbol int<> ml.dmlc.mxnet.Symbol<> ml.dmlc.mxnet.Shape java.lang.String<>} -(def symbol-hand-gen-set #{"scala.Option" - "int org.apache.mxnet.Executor" - "scala.Enumeration$Value" - "org.apache.mxnet.Context" - "scala.Tuple2" - "scala.collection.Traversable"} ) +(def symbol-hand-gen-set + #{"scala.Option" + "scala.Enumeration$Value" + 
"org.apache.mxnet.Context" + "scala.Tuple2" + "scala.collection.Traversable"}) ;;; min and max have a conflicting arity of 2 with the auto gen signatures (def symbol-filter-name-set #{"max" "min"}) @@ -102,34 +205,35 @@ count pos?))) -(def symbol-public-to-hand-gen (filter is-symbol-hand-gen? symbol-public-no-default)) -(def symbol-public-to-gen (->> (remove #(contains?(->> symbol-public-to-hand-gen - (mapv :name) - (mapv str) - (set)) (str (:name %))) symbol-public-no-default))) +(def symbol-public-to-hand-gen + (filter is-symbol-hand-gen? symbol-public-no-default)) +(def symbol-public-to-gen + (get-public-to-gen-methods symbol-public-to-hand-gen + symbol-public-no-default)) (count symbol-public-to-hand-gen) ;=> 35 mostly bind! (count symbol-public-to-gen) ;=> 307 -(into #{} (map :name symbol-public-to-hand-gen));=> #{arange bind ones zeros simpleBind Variable} +(into #{} (map :name symbol-public-to-hand-gen)) +;;=> #{arange bind ones zeros simpleBind Variable} -(defn public-by-name-and-param-count [public-reflect-info] - (->> public-reflect-info - (group-by :name) - (map (fn [[k v]] [k (group-by #(count (:parameter-types %)) v)])) - (into {}))) (defn symbol-vector-args [] - `(if (map? ~'kwargs-map-or-vec-or-sym) (~'util/empty-list) (~'util/coerce-param ~'kwargs-map-or-vec-or-sym #{"scala.collection.Seq"}))) + `(if (map? ~'kwargs-map-or-vec-or-sym) + (~'util/empty-list) + (~'util/coerce-param ~'kwargs-map-or-vec-or-sym #{"scala.collection.Seq"}))) (defn symbol-map-args [] - `(if (map? ~'kwargs-map-or-vec-or-sym) (util/convert-symbol-map ~'kwargs-map-or-vec-or-sym) nil)) + `(if (map? 
~'kwargs-map-or-vec-or-sym) + (util/convert-symbol-map ~'kwargs-map-or-vec-or-sym) + nil)) (defn add-symbol-arities [params function-name] - (if (= ["sym-name" "kwargs-map" "symbol-list" "kwargs-map-1"] (mapv str params)) + (if (= ["sym-name" "kwargs-map" "symbol-list" "kwargs-map-1"] + (mapv str params)) [`([~'sym-name ~'attr-map ~'kwargs-map] (~function-name ~'sym-name (~'util/convert-symbol-map ~'attr-map) (~'util/empty-list) (~'util/convert-symbol-map ~'kwargs-map))) `([~'sym-name ~'kwargs-map-or-vec-or-sym] @@ -180,36 +284,7 @@ `(~'defn ~function-name ~@(remove nil? (gen-symbol-function-arity op-name op-values function-name)))))) -(def license - (str - ";; Licensed to the Apache Software Foundation (ASF) under one or more\n" - ";; contributor license agreements. See the NOTICE file distributed with\n" - ";; this work for additional information regarding copyright ownership.\n" - ";; The ASF licenses this file to You under the Apache License, Version 2.0\n" - ";; (the \"License\"); you may not use this file except in compliance with\n" - ";; the License. 
You may obtain a copy of the License at\n" - ";;\n" - ";; http://www.apache.org/licenses/LICENSE-2.0\n" - ";;\n" - ";; Unless required by applicable law or agreed to in writing, software\n" - ";; distributed under the License is distributed on an \"AS IS\" BASIS,\n" - ";; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" - ";; See the License for the specific language governing permissions and\n" - ";; limitations under the License.\n" - ";;\n")) -(defn write-to-file [functions ns-gen fname] - (with-open [w (clojure.java.io/writer fname)] - (.write w ns-gen) - (.write w "\n\n") - (.write w ";; Do not edit - this is auto-generated") - (.write w "\n\n") - (.write w license) - (.write w "\n\n") - (.write w "\n\n") - (doseq [f functions] - (clojure.pprint/pprint f w) - (.write w "\n")))) (def symbol-gen-ns "(ns org.apache.clojure-mxnet.symbol (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max @@ -223,25 +298,18 @@ (println "Generating symbol file") (write-to-file all-symbol-functions symbol-gen-ns "src/org/apache/clojure_mxnet/gen/symbol.clj")) +;;;;;;; NDArray -;;;;;;;;NDARRAY - - -(def ndarray-reflect-info (->> (:members (r/reflect NDArray)) - (map #(into {} %)))) +(def ndarray-public-no-default + (get-public-no-default-methods NDArray)) -(def ndarray-public (filter (fn [x] (-> x :flags :public)) ndarray-reflect-info)) - -(def ndarray-public-no-default (->> ndarray-public - (filter #(not (re-find #"org\$apache\$mxnet" (str (:name %))))) - (filter #(not (re-find #"\$default" (str (:name %))))))) - -(def ndarray-hand-gen-set #{"org.apache.mxnet.NDArrayFuncReturn" - "org.apache.mxnet.Context" - "scala.Enumeration$Value" - "scala.Tuple2" - "scala.collection.Traversable"} ) +(def ndarray-hand-gen-set + #{"org.apache.mxnet.NDArrayFuncReturn" + "org.apache.mxnet.Context" + "scala.Enumeration$Value" + "scala.Tuple2" + "scala.collection.Traversable"}) (defn is-ndarray-hand-gen? 
[info] (->> (map str (:parameter-types info)) @@ -251,17 +319,17 @@ pos?)) -(def ndarray-public-to-hand-gen (filter is-ndarray-hand-gen? ndarray-public-no-default)) -(def ndarray-public-to-gen (->> (remove #(contains?(->> ndarray-public-to-hand-gen - (mapv :name) - (mapv str) - (set)) (str (:name %))) ndarray-public-no-default))) +(def ndarray-public-to-hand-gen + (filter is-ndarray-hand-gen? ndarray-public-no-default)) +(def ndarray-public-to-gen + (get-public-to-gen-methods ndarray-public-to-hand-gen + ndarray-public-no-default)) (count ndarray-public-to-hand-gen) ;=> 15 (count ndarray-public-to-gen) ;=> 486 -(map :name ndarray-public-to-hand-gen) +(->> ndarray-public-to-hand-gen (map :name) (into #{})) @@ -294,16 +362,19 @@ ))))) +(defn gen-ndarray-functions [public-to-gen-methods] + (for [operation (sort (public-by-name-and-param-count public-to-gen-methods))] + (let [[op-name op-values] operation + function-name (-> op-name + str + scala/decode-scala-symbol + clojure-case + symbol)] + `(~'defn ~function-name + ~@(remove nil? (gen-ndarray-function-arity op-name op-values)))))) + (def all-ndarray-functions - (for [operation (sort (public-by-name-and-param-count ndarray-public-to-gen))] - (let [[op-name op-values] operation - function-name (-> op-name - str - scala/decode-scala-symbol - clojure-case - symbol)] - `(~'defn ~function-name - ~@(remove nil? 
(gen-ndarray-function-arity op-name op-values)))))) + (gen-ndarray-functions ndarray-public-to-gen)) (def ndarray-gen-ns "(ns org.apache.clojure-mxnet.ndarray (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max @@ -314,16 +385,191 @@ (defn generate-ndarray-file [] (println "Generating ndarray file") - (write-to-file all-ndarray-functions ndarray-gen-ns "src/org/apache/clojure_mxnet/gen/ndarray.clj")) + (write-to-file all-ndarray-functions + ndarray-gen-ns + "src/org/apache/clojure_mxnet/gen/ndarray.clj")) + +;;;;;;; SymbolAPI + +(defn symbol-api-coerce-param + [{:keys [name sym type optional?]}] + (let [coerced-param (case type + "Shape" `(when ~sym (~'mx-shape/->shape ~sym)) + "NDArray-or-Symbol[]" `(~'clojure.core/into-array ~sym) + "Map[String, String]" + `(when ~sym + (->> ~sym + (mapv (fn [[~'k ~'v]] [~'k (str ~'v)])) + (into {}) + ~'util/convert-map)) + sym) + nil-param-allowed? (#{"name" "attr"} name)] + (if (and optional? (not nil-param-allowed?)) + `(~'util/->option ~coerced-param) + coerced-param))) + +(defn gen-symbol-api-doc [fn-description params] + (let [param-descriptions (mapv (fn [{:keys [name description optional?]}] + (str "`" name "`: " + description + (when optional? " (optional)") + "\n")) + params)] + (str fn-description "\n\n" + (apply str param-descriptions)))) + +(defn gen-symbol-api-default-arity [op-name params] + (let [opt-params (filter :optional? params) + coerced-params (mapv symbol-api-coerce-param params) + default-args (array-map :keys (mapv :sym params) + :or (into {} + (mapv (fn [{:keys [sym]}] [sym nil]) + opt-params)) + :as 'opts)] + `([~default-args] + (~'util/coerce-return + (~(symbol (str "SymbolAPI/" op-name)) + ~@coerced-params))))) + +(defn gen-symbol-api-function [op-name] + (let [{:keys [fn-name fn-description args]} (gen-op-info op-name) + params (mapv (fn [{:keys [name type optional?] :as opts}] + (assoc opts + :sym (symbol name) + :optional? (or optional? 
+ (= "NDArray-or-Symbol" type)))) + (conj args + {:name "name" + :type "String" + :optional? true + :description "Name of the symbol"} + {:name "attr" + :type "Map[String, String]" + :optional? true + :description "Attributes of the symbol"})) + doc (gen-symbol-api-doc fn-description params) + default-call (gen-symbol-api-default-arity op-name params)] + `(~'defn ~(symbol fn-name) + ~doc + ~@default-call))) + +(def all-symbol-api-functions + (mapv gen-symbol-api-function op-names)) + +(def symbol-api-gen-ns "(ns + ^{:doc \"Experimental\"} + org.apache.clojure-mxnet.symbol-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.util :as util] + [org.apache.clojure-mxnet.shape :as mx-shape]) + (:import (org.apache.mxnet SymbolAPI)))") + +(defn generate-symbol-api-file [] + (println "Generating symbol-api file") + (write-to-file all-symbol-api-functions symbol-api-gen-ns "src/org/apache/clojure_mxnet/gen/symbol_api.clj")) + +;;;;;;; NDArrayAPI + +(defn ndarray-api-coerce-param + [{:keys [sym type optional?]}] + (let [coerced-param (case type + "Shape" `(when ~sym (~'mx-shape/->shape ~sym)) + "NDArray-or-Symbol[]" `(~'clojure.core/into-array ~sym) + sym)] + (if optional? + `(~'util/->option ~coerced-param) + coerced-param))) + +(defn gen-ndarray-api-doc [fn-description params] + (let [param-descriptions (mapv (fn [{:keys [name description optional?]}] + (str "`" name "`: " + description + (when optional? " (optional)") + "\n")) + params)] + (str fn-description "\n\n" + (apply str param-descriptions)))) + +(defn gen-ndarray-api-default-arity [op-name params] + (let [opt-params (filter :optional? 
params) + coerced-params (mapv ndarray-api-coerce-param params) + default-args (array-map :keys (mapv :sym params) + :or (into {} + (mapv (fn [{:keys [sym]}] [sym nil]) + opt-params)) + :as 'opts)] + `([~default-args] + (~'util/coerce-return + (~(symbol (str "NDArrayAPI/" op-name)) + ~@coerced-params))))) + +(defn gen-ndarray-api-required-arity [fn-name req-params] + (let [req-args (->> req-params + (mapv (fn [{:keys [sym]}] [(keyword sym) sym])) + (into {}))] + `(~(mapv :sym req-params) + (~(symbol fn-name) ~req-args)))) + +(defn gen-ndarray-api-function [op-name] + (let [{:keys [fn-name fn-description args]} (gen-op-info op-name) + params (mapv (fn [{:keys [name] :as opts}] + (assoc opts :sym (symbol name))) + (conj args {:name "out" + :type "NDArray-or-Symbol" + :optional? true + :description "Output array."})) + doc (gen-ndarray-api-doc fn-description params) + opt-params (filter :optional? params) + req-params (remove :optional? params) + req-call (gen-ndarray-api-required-arity fn-name req-params) + default-call (gen-ndarray-api-default-arity op-name params)] + (if (= 1 (count req-params)) + `(~'defn ~(symbol fn-name) + ~doc + ~@default-call) + `(~'defn ~(symbol fn-name) + ~doc + ~req-call + ~default-call)))) + +(def all-ndarray-api-functions + (mapv gen-ndarray-api-function op-names)) + +(def ndarray-api-gen-ns "(ns + ^{:doc \"Experimental\"} + org.apache.clojure-mxnet.ndarray-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + (:require [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArrayAPI)))") + + +(defn generate-ndarray-api-file [] + (println "Generating ndarray-api file") + (write-to-file all-ndarray-api-functions + ndarray-api-gen-ns + "src/org/apache/clojure_mxnet/gen/ndarray_api.clj")) ;;; autogen the files (do (generate-ndarray-file) - (generate-symbol-file)) + 
(generate-ndarray-api-file) + (generate-symbol-file) + (generate-symbol-api-file)) (comment + (gen-op-info "ElementWiseSum") + + (gen-ndarray-api-function "Activation") + + (gen-symbol-api-function "Activation") + ;; This generates a file with the bulk of the nd-array functions (generate-ndarray-file) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/eval_metric.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/eval_metric.clj index 1946103a4a2d..f1fe2d18bd35 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/eval_metric.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/eval_metric.clj @@ -18,7 +18,7 @@ (ns org.apache.clojure-mxnet.eval-metric (:refer-clojure :exclude [get update]) (:require [org.apache.clojure-mxnet.util :as util]) - (:import (org.apache.mxnet Accuracy TopKAccuracy F1 Perplexity MAE MSE RMSE CustomMetric))) + (:import (org.apache.mxnet Accuracy TopKAccuracy F1 Perplexity MAE MSE RMSE CustomMetric CompositeEvalMetric))) (defn accuracy "Basic Accuracy Metric" @@ -74,11 +74,21 @@ [f-eval mname] `(new CustomMetric (util/scala-fn ~f-eval) ~mname)) +(defn comp-metric + "Create a metric instance composed out of several metrics" + [metrics] + (let [cm (CompositeEvalMetric.)] + (doseq [m metrics] (.add cm m)) + cm)) + (defn get - "Get the values of the metric in a vector form (name and value)" + "Get the values of the metric in as a map of {name value} pairs" [metric] - (let [[[mname] [mvalue]] (util/tuple->vec (.get metric))] - [mname mvalue])) + (let [m (apply zipmap (-> (.get metric) + util/tuple->vec))] + (if-not (instance? 
CompositeEvalMetric metric) + (first m) + m))) (defn reset "clear the internal statistics to an initial state" diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj index e2e87ed47e2f..f81a35803171 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj @@ -16,12 +16,14 @@ ;; (ns org.apache.clojure-mxnet.image + "Image API of Clojure package." (:require [t6.from-scala.core :refer [$ $$] :as $] [org.apache.clojure-mxnet.dtype :as dtype] [org.apache.clojure-mxnet.ndarray :as ndarray] [org.apache.clojure-mxnet.util :as util] [clojure.spec.alpha :as s]) (:import (org.apache.mxnet Image NDArray) + (java.awt.image BufferedImage) (java.io InputStream))) ;; Flags for conversion of images @@ -37,7 +39,18 @@ (s/keys :opt-un [::color-flag ::to-rgb ::output])) (defn decode-image - "Decodes an image from an input stream" + "Decodes an image from an input stream with OpenCV + `input-stream`: `InputStream` - Contains the binary encoded image + `color-flag`: 0 or 1 - Convert decoded image to grayscale (0) or color (1) + `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB + format (instead of opencv's default BGR) + `output`: nil or `NDArray` + returns: `NDArray` with dtype uint8 + + Ex: + (decode-image input-stream) + (decode-image input-stream {:color-flag 1}) + (decode-image input-stream {:color-flag 0 :output nd})" ([input-stream {:keys [color-flag to-rgb output] :or {color-flag COLOR to-rgb true output nil} :as opts}] @@ -54,7 +67,19 @@ (s/or :none nil? :some ::to-rgb)) (defn read-image - "Reads an image file and returns an ndarray" + "Reads an image file and returns an ndarray with OpenCV. It returns image in + RGB by default instead of OpenCV's default BGR. 
+ `filename`: string - Name of the image file to be loaded + `color-flag`: 0 or 1 - Convert decoded image to grayscale (0) or color (1) + `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB + format (instead of opencv's default BGR) + `output`: nil or `NDArray` + returns: `NDArray` with dtype uint8 + + Ex: + (read-image \"cat.jpg\") + (read-image \"cat.jpg\" {:color-flag 0}) + (read-image \"cat.jpg\" {:color-flag 1 :output nd})" ([filename {:keys [color-flag to-rgb output] :or {color-flag nil to-rgb nil output nil} :as opts}] @@ -74,7 +99,17 @@ (s/def ::optional-int (s/or :none nil? :some int?)) (defn resize-image - "Resizes the image array to (width, height)" + "Resizes the image array to (width, height) + `input`: `NDArray` - source image in NDArray + `w`: int - Width of resized image + `h`: int - Height of resized image + `interpolation`: Interpolation method. Default is INTER_LINEAR + `ouput`: nil or `NDArray` + returns: `NDArray` + + Ex: + (resize-image nd-img 300 300) + (resize-image nd-img 28 28 {:output nd})" ([input w h {:keys [interpolation output] :or {interpolation nil output nil} :as opts}] @@ -88,7 +123,21 @@ (resize-image input w h {}))) (defn apply-border - "Pad image border" + "Pad image border with OpenCV. 
+ `input`: `NDArray` - source image in NDArray + `top`: int - Top margin + `bottom`: int - Bottom margin + `left`: int - Left margin + `right`: int - Right margin + `fill-type`: nil or Filling type - Default BORDER_CONSTANT + `value`: nil or double - Deprecated, use `values` instead + `values`: Fill with value(RGB or gray), up to 4 channels + `output`: nil or `NDArray` + returns: `NDArray` + + Ex: + (apply-border img-nd 1 1 1 1) + (apply-border img-nd 3 3 0 0)" ([input top bottom left right {:keys [fill-type value values output] :or {fill-type nil value nil values nil output nil} @@ -109,7 +158,17 @@ (apply-border input top bottom left right {}))) (defn fixed-crop - "Return a fixed crop of the image" + "Return a fixed crop of the image. + `input`: `NDArray` - Source image in NDArray + `x0`: int - Starting x point + `y0`: int - Starting y point + `w`: int - Width of the image + `h`: int - Height of the image + returns: cropped `NDArray` + + Ex: + (fixed-crop nd-img 0 0 28 28) + (fixed-crop nd-img 10 0 100 300)" [input x0 y0 w h] (util/validate! ::ndarray input "Invalid input array") (util/validate! ::int x0 "Invalid starting x coordinate") @@ -119,7 +178,9 @@ (Image/fixedCrop input x0 y0 w h)) (defn rgb-array? - "Returns whether the ndarray is in the RGB format" + "Returns whether the ndarray is in the RGB format + `input`: `NDArray` - Source image in NDArray + returns: boolean" [input] (util/validate! ::ndarray input "Invalid input array") (let [shape (ndarray/shape-vec input)] @@ -133,7 +194,74 @@ (s/and ::ndarray ::all-bytes ::rgb-array)) (defn to-image - "Convert a NDArray image in RGB format to a real image" + "Convert a NDArray image in RGB format to a real image. + `input`: `NDArray` - Source image in NDArray + returns: `BufferedImage`" [input] (util/validate! ::to-image-ndarray input "Invalid input array") (Image/toImage input)) + +(s/def ::buffered-image #(instance? BufferedImage %)) +(s/def ::x-min number?) +(s/def ::x-max number?) +(s/def ::y-min number?) 
+(s/def ::y-max number?) +(s/def ::coordinate (s/keys :req-un [::x-min ::x-max ::y-min ::y-max])) +(s/def ::coordinates (s/coll-of ::coordinate)) +(s/def ::names (s/nilable (s/coll-of string?))) +(s/def ::stroke (s/and integer? pos?)) +(s/def ::font-size-mult (s/and float? pos?)) +(s/def ::transparency (s/and float? #(<= 0.0 % 1.0))) +(s/def ::coordinates-names + (fn [[coordinates names]] (= (count coordinates) (count names)))) + +(defn- convert-coordinate + "Convert bounding box coordinate to Scala correct types." + [{:keys [x-min x-max y-min y-max]}] + {:xmin (int x-min) + :xmax (int x-max) + :ymin (int y-min) + :ymax (int y-max)}) + +(defn draw-bounding-box! + "Draw bounding boxes on `buffered-image` and Mutate the input image. + `buffered-image`: BufferedImage + `coordinates`: collection of {:xmin int :xmax int :ymin int :ymax int} + `font-size-mult`: positive float - Font size multiplier + `names`: collection of strings - List of names for the bounding boxes + `stroke`: positive integer - thickness of the bounding box + `transparency`: float in (0.0, 1.0) - Transparency of the bounding box + returns: Modified `buffered-image` + Ex: + (draw-bounding-box! img [{:x-min 0 :x-max 100 :y-min 0 :y-max 100}]) + (draw-bounding-box! [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} + {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] + {:stroke 2 + :names [\"pug\" \"cookie\"] + :transparency 0.8 + :font-size-mult 2.0})" + ([buffered-image coordinates] + (draw-bounding-box! buffered-image coordinates {})) + ([buffered-image coordinates + {:keys [names stroke font-size-mult transparency] + :or {stroke 3 font-size-mult 1.0 transparency 1.0} + :as opts}] + (util/validate! ::buffered-image buffered-image "Invalid input image") + (util/validate! ::coordinates coordinates "Invalid input coordinates") + (util/validate! ::names names "Invalid input names") + (util/validate! ::stroke stroke "Invalid input stroke") + (util/validate! 
::font-size-mult font-size-mult "Invalid input font-size-mult") + (util/validate! ::transparency transparency "Invalid input transparency") + (when (pos? (count names)) + (util/validate! ::coordinates-names [coordinates names] "Invalid number of names")) + (Image/drawBoundingBox + buffered-image + (->> coordinates + (map convert-coordinate) + (map util/convert-map) + (into-array)) + (util/->option (into-array names)) + (util/->option (int stroke)) + (util/->option (float font-size-mult)) + (util/->option (float transparency))) + buffered-image)) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj index aa5ce39f7a80..09f17e5d81f4 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj @@ -16,6 +16,7 @@ ;; (ns org.apache.clojure-mxnet.module + "Module API for Clojure package." (:refer-clojure :exclude [update symbol]) (:require [org.apache.clojure-mxnet.callback :as callback] [org.apache.clojure-mxnet.context :as context] @@ -31,18 +32,29 @@ (:import (org.apache.mxnet.module Module FitParams BaseModule) (org.apache.mxnet.io MXDataIter NDArrayIter) (org.apache.mxnet Initializer Optimizer NDArray DataBatch - Context EvalMetric Monitor Callback$Speedometer DataDesc))) + Context EvalMetric Monitor Callback$Speedometer + DataDesc))) (defn module - "Module is a basic module that wrap a symbol. - sym : Symbol definition. - map of options - :data-names - Input data names. - :label-names - Input label names - :contexts - Default is cpu(). - :workload-list - Default nil, indicating uniform workload. - :fixed-param-names Default nil, indicating no network parameters are fixed." - ([sym {:keys [data-names label-names contexts workload-list fixed-param-names] :as opts + "Module is a basic module that wrap a `symbol`. + `sym`: Symbol definition. 
+ `opts-map` { + `data-names`: vector of strings - Default is [\"data\"] + Input data names + `label-names`: vector of strings - Default is [\"softmax_label\"] + Input label names + `contexts`: Context - Default is `context/cpu`. + `workload-list`: Default nil + Indicating uniform workload. + `fixed-param-names`: Default nil + Indicating no network parameters are fixed. + } + Ex: + (module sym) + (module sym {:data-names [\"data\"] + :label-names [\"linear_regression_label\"]}" + ([sym {:keys [data-names label-names contexts + workload-list fixed-param-names] :as opts :or {data-names ["data"] label-names ["softmax_label"] contexts [(context/default-context)]}}] @@ -80,31 +92,41 @@ (s/def ::force-rebind boolean?) (s/def ::shared-module #(instance? Module)) (s/def ::grad-req string?) -(s/def ::bind-opts (s/keys :req-un [::data-shapes] :opt-un [::label-shapes ::for-training ::inputs-need-grad - ::force-rebind ::shared-module ::grad-req])) +(s/def ::bind-opts + (s/keys :req-un [::data-shapes] + :opt-un [::label-shapes ::for-training ::inputs-need-grad + ::force-rebind ::shared-module ::grad-req])) (defn bind "Bind the symbols to construct executors. This is necessary before one can perform computation with the module. - mod : module - map of opts: - :data-shapes Typically is (provide-data-desc data-iter). Data shape must be in the form of io/data-desc with is a map of :name :shape :dtype and :layout - :label-shapes Typically is (provide-label-desc data-iter). map of :name :shape :dtype and :layout - :for-training Default is `true`. Whether the executors should be bind for training. - :inputs-need-grad Default is `false`. - Whether the gradients to the input data need to be computed. - Typically this is not needed. - But this might be needed when implementing composition of modules. - :force-rebind Default is `false`. - This function does nothing if the executors are already binded. - But with this `true`, the executors will be forced to rebind. 
- :shared-module Default is nil. This is used in bucketing. - When not `None`, the shared module essentially corresponds to - a different bucket -- a module with different symbol - but with the same sets of parameters - (e.g. unrolled RNNs with different lengths). " - [mod {:keys [data-shapes label-shapes for-training inputs-need-grad force-rebind - shared-module grad-req] :as opts + `mod`: module + `opts-map` { + `data-shapes`: map of `:name`, `:shape`, `:dtype`, and `:layout` + Typically is `(provide-data-desc data-iter)`.Data shape must be in the + form of `io/data-desc` + `label-shapes`: map of `:name` `:shape` `:dtype` and `:layout` + Typically is `(provide-label-desc data-iter)`. + `for-training`: boolean - Default is `true` + Whether the executors should be bind for training. + `inputs-need-grad`: boolean - Default is `false`. + Whether the gradients to the input data need to be computed. + Typically this is not needed. But this might be needed when + implementing composition of modules. + `force-rebind`: boolean - Default is `false`. + This function does nothing if the executors are already binded. But + with this `true`, the executors will be forced to rebind. + `shared-module`: Default is nil. + This is used in bucketing. When not `nil`, the shared module + essentially corresponds to a different bucket -- a module with + different symbol but with the same sets of parameters (e.g. unrolled + RNNs with different lengths). + } + Ex: + (bind {:data-shapes (mx-io/provide-data train-iter) + :label-shapes (mx-io/provide-label test-iter)})) " + [mod {:keys [data-shapes label-shapes for-training inputs-need-grad + force-rebind shared-module grad-req] :as opts :or {for-training true inputs-need-grad false force-rebind false @@ -129,24 +151,36 @@ (s/def ::aux-params map?) (s/def ::force-init boolean?) (s/def ::allow-extra boolean?) 
-(s/def ::init-params-opts (s/keys :opt-un [::initializer ::arg-params ::aux-params - ::force-init ::allow-extra])) +(s/def ::init-params-opts + (s/keys :opt-un [::initializer ::arg-params ::aux-params + ::force-init ::allow-extra])) (defn init-params - " Initialize the parameters and auxiliary states. - options map - :initializer - Called to initialize parameters if needed. - :arg-params - If not nil, should be a map of existing arg-params. - Initialization will be copied from that. - :auxParams - If not nil, should be a map of existing aux-params. - Initialization will be copied from that. - :allow-missing - If true, params could contain missing values, - and the initializer will be called to fill those missing params. - :force-init - If true, will force re-initialize even if already initialized. - :allow-extra - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when argParams or auxParams - contain extra parameters that is not needed by the executor." - ([mod {:keys [initializer arg-params aux-params allow-missing force-init allow-extra] :as opts + "Initialize the parameters and auxiliary states. + `opts-map` { + `initializer`: Initializer - Default is `uniform` + Called to initialize parameters if needed. + `arg-params`: map + If not nil, should be a map of existing arg-params. Initialization + will be copied from that. + `aux-params`: map + If not nil, should be a map of existing aux-params. Initialization + will be copied from that. + `allow-missing`: boolean - Default is `false` + If true, params could contain missing values, and the initializer will + be called to fill those missing params. + `force-init` boolean - Default is `false` + If true, will force re-initialize even if already initialized. + `allow-extra`: boolean - Default is `false` + Whether allow extra parameters that are not needed by symbol. 
+ If this is `true`, no error will be thrown when `arg-params` or + `aux-params` contain extra parameters that is not needed by the + executor. + Ex: + (init-params {:initializer (initializer/xavier)}) + (init-params {:force-init true :allow-extra true})" + ([mod {:keys [initializer arg-params aux-params allow-missing force-init + allow-extra] :as opts :or {initializer (initializer/uniform 0.01) allow-missing false force-init false @@ -167,17 +201,23 @@ (s/def ::kvstore string?) (s/def ::reset-optimizer boolean?) (s/def ::force-init boolean?) -(s/def ::init-optimizer-opts (s/keys :opt-un [::optimizer ::kvstore ::reset-optimizer ::force-init])) +(s/def ::init-optimizer-opts + (s/keys :opt-un [::optimizer ::kvstore ::reset-optimizer ::force-init])) (defn init-optimizer - " Install and initialize optimizers. - - mod Module - - options map of - - kvstore - - reset-optimizer Default `True`, indicating whether we should set - `rescaleGrad` & `idx2name` for optimizer according to executorGroup - - force-init Default `False`, indicating whether we should force - re-initializing the optimizer in the case an optimizer is already installed." + "Install and initialize optimizers. + `mod`: Module + `opts-map` { + `kvstore`: string - Default is \"local\" + `optimizer`: Optimizer - Default is `sgd` + `reset-optimizer`: boolean - Default is `true` + Indicating whether we should set `rescaleGrad` & `idx2name` for + optimizer according to executorGroup. + `force-init`: boolean - Default is `false` + Indicating whether we should force re-initializing the optimizer + in the case an optimizer is already installed. + Ex: + (init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1})})" ([mod {:keys [kvstore optimizer reset-optimizer force-init] :as opts :or {kvstore "local" optimizer (optimizer/sgd) @@ -191,8 +231,10 @@ (defn forward "Forward computation. 
- data-batch - input data of form io/data-batch either map or DataBatch - is-train - Default is nil, which means `is_train` takes the value of `for_training`." + `data-batch`: Either map or DataBatch + Input data of form `io/data-batch`. + `is-train`: Default is nil + Which means `is_train` takes the value of `for_training`." ([mod data-batch is-train] (util/validate! ::mx-io/data-batch data-batch "Invalid data batch") (doto mod @@ -209,9 +251,9 @@ (defn backward "Backward computation. - out-grads - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function." + `out-grads`: collection of NDArrays + Gradient on the outputs to be propagated back. This parameter is only + needed when bind is called on outputs that are not a loss function." ([mod out-grads] (util/validate! ::out-grads out-grads "Invalid out-grads") (doto mod @@ -227,50 +269,48 @@ (.forwardBackward data-batch))) (defn outputs - " Get outputs of the previous forward computation. - In the case when data-parallelism is used, - the outputs will be collected from multiple devices. - The results will look like `[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]`, - those `NDArray` might live on different devices." + "Get outputs of the previous forward computation. + In the case when data-parallelism is used, the outputs will be collected from + multiple devices. The results will look like + `[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]`. + Those `NDArray`s might live on different devices." [mod] (->> (.getOutputs mod) (util/scala-vector->vec) (mapv util/scala-vector->vec))) (defn update - "Update parameters according to the installed optimizer and the gradients computed - in the previous forward-backward batch." + "Update parameters according to the installed optimizer and the gradients + computed in the previous forward-backward batch." 
[mod] (doto mod (.update))) (defn outputs-merged - " Get outputs of the previous forward computation. - return In the case when data-parallelism is used, - the outputs will be merged from multiple devices, - as they look like from a single executor. - The results will look like `[out1, out2]`" + "Get outputs of the previous forward computation. + In the case when data-parallelism is used, the outputs will be merged from + multiple devices, as they look like from a single executor. + The results will look like `[out1, out2]`." [mod] (->> (.getOutputsMerged mod) (util/scala-vector->vec))) (defn input-grads - " Get the gradients to the inputs, computed in the previous backward computation. - In the case when data-parallelism is used, - the outputs will be collected from multiple devices. - The results will look like `[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]` - those `NDArray` might live on different devices." + "Get the gradients to the inputs, computed in the previous backward computation. + In the case when data-parallelism is used, the outputs will be collected from + multiple devices. The results will look like + `[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]`. + Those `NDArray`s might live on different devices." [mod] (->> (.getInputGrads mod) (util/scala-vector->vec) (mapv util/scala-vector->vec))) (defn input-grads-merged - " Get the gradients to the inputs, computed in the previous backward computation. - return In the case when data-parallelism is used, - the outputs will be merged from multiple devices, - as they look like from a single executor. - The results will look like `[grad1, grad2]`" + "Get the gradients to the inputs, computed in the previous backward computation. + In the case when data-parallelism is used, the outputs will be merged from + multiple devices, as they look like from a single executor. + The results will look like `[grad1, grad2]`." 
[mod] (->> (.getInputGradsMerged mod) (util/scala-vector->vec))) @@ -278,16 +318,25 @@ (s/def ::prefix string?) (s/def ::epoch int?) (s/def ::save-opt-states boolean?) -(s/def ::save-checkpoint-opts (s/keys :req-un [::prefix ::epoch] :opt-un [::save-opt-states ::save-checkpoint])) +(s/def ::save-checkpoint-opts + (s/keys :req-un [::prefix ::epoch] + :opt-un [::save-opt-states ::save-checkpoint])) (defn save-checkpoint - " Save current progress to checkpoint. - Use mx.callback.module_checkpoint as epoch_end_callback to save during training. - - mod Module - - opt-map with - :prefix The file prefix to checkpoint to - :epoch The current epoch number - :save-opt-states Whether to save optimizer states for continue training " + "Save current progress to checkpoint. + Use mx.callback.module_checkpoint as epoch_end_callback to save during + training. + `mod`: Module + `opts-map` { + `prefix`: string + The file prefix to checkpoint to + `epoch`: int + The current epoch number + `save-opt-states`: boolean - Default is `false` + Whether to save optimizer states for continue training + } + Ex: + (save-checkpoint {:prefix \"saved_model\" :epoch 0 :save-opt-states true})" ([mod {:keys [prefix epoch save-opt-states] :as opts :or {save-opt-states false}}] (util/validate! ::save-checkpoint-opts opts "Invalid save checkpoint opts") @@ -303,24 +352,34 @@ (s/def ::contexts (s/coll-of ::context :kind vector?)) (s/def ::workload-list (s/coll-of number? :kind vector?)) (s/def ::fixed-params-names (s/coll-of string? :kind vector?)) -(s/def ::load-checkpoint-opts (s/keys :req-un [::prefix ::epoch] - :opt-un [::load-optimizer-states ::data-names ::label-names - ::contexts ::workload-list ::fixed-param-names])) +(s/def ::load-checkpoint-opts + (s/keys :req-un [::prefix ::epoch] + :opt-un [::load-optimizer-states ::data-names ::label-names + ::contexts ::workload-list ::fixed-param-names])) (defn load-checkpoint "Create a model from previously saved checkpoint. 
- - opts map of - - prefix Path prefix of saved model files. You should have prefix-symbol.json, - prefix-xxxx.params, and optionally prefix-xxxx.states, - where xxxx is the epoch number. - - epoch Epoch to load. - - load-optimizer-states Whether to load optimizer states. - Checkpoint needs to have been made with save-optimizer-states=True - - dataNames Input data names. - - labelNames Input label names - - contexts Default is cpu(). - - workload-list Default nil, indicating uniform workload. - - fixed-param-names Default nil, indicating no network parameters are fixed." + `opts-map` { + `prefix`: string + Path prefix of saved model files. You should have prefix-symbol.json, + prefix-xxxx.params, and optionally prefix-xxxx.states, where xxxx is + the epoch number. + `epoch`: int + Epoch to load. + `load-optimizer-states`: boolean - Default is false + Whether to load optimizer states. Checkpoint needs to have been made + with `save-optimizer-states` = `true`. + `data-names`: vector of strings - Default is [\"data\"] + Input data names. + `label-names`: vector of strings - Default is [\"softmax_label\"] + Input label names. + `contexts`: Context - Default is `context/cpu` + `workload-list`: Default nil + Indicating uniform workload. + `fixed-param-names`: Default nil + Indicating no network parameters are fixed. + Ex: + (load-checkpoint {:prefix \"my-model\" :epoch 1 :load-optimizer-states true}" ([{:keys [prefix epoch load-optimizer-states data-names label-names contexts workload-list fixed-param-names] :as opts :or {load-optimizer-states false @@ -358,10 +417,10 @@ (util/scala-map->map (.auxParams mod))) (defn reshape - " Reshapes the module for new input shapes. - - mod module - - data-shapes Typically is `(provide-data data-iter) - - param label-shapes Typically is `(provide-label data-tier)`. " + "Reshapes the module for new input shapes. 
+ `mod`: Module + `data-shapes`: Typically is `(provide-data data-iter)` + `label-shapes`: Typically is `(provide-label data-tier)`" ([mod data-shapes label-shapes] (util/validate! ::data-shapes data-shapes "Invalid data-shapes") (util/validate! (s/nilable ::label-shapes) label-shapes "Invalid label-shapes") @@ -376,28 +435,35 @@ ([mod data-shapes] (reshape mod data-shapes nil))) -(s/def ::set-param-opts (s/keys :opt-un [::arg-params ::aux-params ::allow-missing ::force-init ::allow-extra])) +(s/def ::set-param-opts + (s/keys :opt-un [::arg-params ::aux-params ::allow-missing + ::force-init ::allow-extra])) (defn get-params [mod] (.getParams mod)) (defn set-params - " Assign parameter and aux state values. - - mod module - - arg-params : map - map of name to value (`NDArray`) mapping. - - aux-params : map - map of name to value (`NDArray`) mapping. - - allow-missing : bool - If true, params could contain missing values, and the initializer will be - called to fill those missing params. - - force-init : bool - If true, will force re-initialize even if already initialized. - - allow-extra : bool - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg-params or aux-params - contain extra parameters that is not needed by the executor." - [mod {:keys [arg-params aux-params allow-missing force-init allow-extra] :as opts + "Assign parameters and aux state values. + `mod`: Module + `opts-map` { + `arg-params`: map - map of name to value (`NDArray`) mapping. + `aux-params`: map - map of name to value (`NDArray`) mapping. + `allow-missing`: boolean + If true, params could contain missing values, and the initializer will + be called to fill those missing params. + `force-init`: boolean - Default is `false` + If true, will force re-initialize even if already initialized. + `allow-extra`: boolean - Default is `false` + Whether allow extra parameters that are not needed by symbol. 
If this + is `true`, no error will be thrown when arg-params or aux-params + contain extra parameters that is not needed by the executor. + } + Ex: + (set-params mod + {:arg-params {\"fc_0_weight\" (ndarray/array [0.15 0.2 0.25 0.3] [2 2]) + :allow-missing true})" + [mod {:keys [arg-params aux-params allow-missing force-init + allow-extra] :as opts :or {allow-missing false force-init true allow-extra false}}] (util/validate! ::set-param-opts opts "Invalid set-params") (doto mod @@ -409,33 +475,32 @@ allow-extra))) (defn install-monitor - "Install monitor on all executors" + "Install monitor on all executors." [mod monitor] (doto mod (.installMonitor monitor))) (defn borrow-optimizer - "Borrow optimizer from a shared module. Used in bucketing, where exactly the same - optimizer (esp. kvstore) is used. - - mod module - - shared-module" + "Borrow optimizer from a shared module. Used in bucketing, where exactly the + same optimizer (esp. kvstore) is used. + `mod`: Module + `shared-module`" [mod shared-module] (doto mod (.borrowOptimizer shared-module))) (defn save-optimizer-states - "Save optimizer (updater) state to file - - mod module - - fname Path to output states file." + "Save optimizer (updater) state to file. + `mod`: Module + `fname`: string - Path to output states file." [mod fname] (doto mod (.saveOptimizerStates mod fname))) (defn load-optimizer-states - "Load optimizer (updater) state from file - - mod module - - fname Path to input states file. - " + "Load optimizer (updater) state from file. + `mod`: Module + `fname`: string - Path to input states file." [mod fname] (doto mod (.loadOptimzerStates fname))) @@ -444,10 +509,13 @@ (s/def ::labels (s/coll-of ::ndarray :kind vector?)) (defn update-metric - "Evaluate and accumulate evaluation metric on outputs of the last forward computation. - - mod module - - eval-metric - - labels" + "Evaluate and accumulate evaluation metric on outputs of the last forward + computation. 
+ `mod`: module + `eval-metric`: EvalMetric + `labels`: collection of NDArrays + Ex: + (update-metric mod (eval-metric/mse) labels)" [mod eval-metric labels] (util/validate! ::eval-metric eval-metric "Invalid eval metric") (util/validate! ::labels labels "Invalid labels") @@ -458,18 +526,48 @@ (s/def ::validation-metric ::eval-metric) (s/def ::monitor #(instance? Monitor %)) (s/def ::batch-end-callback #(instance? Callback$Speedometer %)) -(s/def ::fit-params-opts (s/keys :opt-un [::eval-metric ::kvstore ::optimizer ::initializer - ::arg-params ::aux-params ::allow-missing ::force-rebind - ::force-init ::begin-epoch ::validation-metric ::monitor - ::batch-end-callback])) +(s/def ::fit-params-opts + (s/keys :opt-un [::eval-metric ::kvstore ::optimizer ::initializer + ::arg-params ::aux-params ::allow-missing ::force-rebind + ::force-init ::begin-epoch ::validation-metric ::monitor + ::batch-end-callback])) ;; callbacks are not supported for now (defn fit-params - "Fit Params" + "Initialize FitParams with provided parameters. + `eval-metric`: EvalMetric - Default is `accuracy` + `kvstore`: String - Default is \"local\" + `optimizer`: Optimizer - Default is `sgd` + `initializer`: Initializer - Default is `uniform` + Called to initialize parameters if needed. + `arg-params`: map + If not nil, should be a map of existing `arg-params`. Initialization + will be copied from that. + `aux-params`: map - + If not nil, should be a map of existing `aux-params`. Initialization + will be copied from that. + `allow-missing`: boolean - Default is `false` + If `true`, params could contain missing values, and the initializer will + be called to fill those missing params. + `force-rebind`: boolean - Default is `false` + This function does nothing if the executors are already binded. But with + this `true`, the executors will be forced to rebind. + `force-init`: boolean - Default is `false` + If `true`, will force re-initialize even if already initialized. 
+ `begin-epoch`: int - Default is 0 + `validation-metric`: EvalMetric + `monitor`: Monitor + Ex: + (fit-params {:force-init true :force-rebind true :allow-missing true}) + (fit-params + {:batch-end-callback (callback/speedometer batch-size 100) + :initializer (initializer/xavier) + :optimizer (optimizer/sgd {:learning-rate 0.01}) + :eval-metric (eval-metric/mse)})" ([{:keys [eval-metric kvstore optimizer initializer arg-params aux-params - allow-missing force-rebind force-init begin-epoch validation-metric monitor - batch-end-callback] :as opts + allow-missing force-rebind force-init begin-epoch + validation-metric monitor batch-end-callback] :as opts :or {eval-metric (eval-metric/accuracy) kvstore "local" optimizer (optimizer/sgd) @@ -500,25 +598,36 @@ (s/def ::ndarray-iter #(instance? NDArrayIter %)) (s/def ::train-data (s/or :mx-iter ::mx-data-iter :ndarry-iter ::ndarray-iter)) (s/def ::eval-data ::train-data) -(s/def ::num-epoch int?) +(s/def ::num-epoch (s/and int? pos?)) (s/def ::fit-params #(instance? FitParams %)) -(s/def ::fit-options (s/keys :req-un [::train-data] :opt-un [::eval-data ::num-epoch ::fit-params])) +(s/def ::fit-options + (s/keys :req-un [::train-data] + :opt-un [::eval-data ::num-epoch ::fit-params])) ;;; High Level API (defn score - " Run prediction on `eval-data` and evaluate the performance according to `eval-metric`. - - mod module - - option map with - :eval-data : DataIter - :eval-metric : EvalMetric - :num-batch Number of batches to run. Default is `Integer.MAX_VALUE`, - indicating run until the `DataIter` finishes. - :batch-end-callback -not supported yet - :reset Default `True`, - indicating whether we should reset `eval-data` before starting evaluating. - :epoch Default 0. For compatibility, this will be passed to callbacks (if any). - During training, this will correspond to the training epoch number." + "Run prediction on `eval-data` and evaluate the performance according to + `eval-metric`. 
+ `mod`: module + `opts-map` { + `eval-data`: DataIter + `eval-metric`: EvalMetric + `num-batch`: int - Default is `Integer.MAX_VALUE` + Number of batches to run. Indicating run until the `DataIter` + finishes. + `batch-end-callback`: not supported yet. + `reset`: boolean - Default is `true`, + Indicating whether we should reset `eval-data` before starting + evaluating. + `epoch`: int - Default is 0 + For compatibility, this will be passed to callbacks (if any). During + training, this will correspond to the training epoch number. + } + Ex: + (score mod {:eval-data data-iter :eval-metric (eval-metric/accuracy)}) + (score mod {:eval-data data-iter + :eval-metric (eval-metric/mse) :num-batch 10})" [mod {:keys [eval-data eval-metric num-batch reset epoch] :as opts :or {num-batch Integer/MAX_VALUE reset true @@ -537,15 +646,30 @@ (defn fit "Train the module parameters. - - mod module - - train-data (data-iterator) - - eval-data (data-iterator)If not nil, will be used as validation set and evaluate - the performance after each epoch. - - num-epoch Number of epochs to run training. - - f-params Extra parameters for training (See fit-params)." + `mod`: Module + `opts-map` { + `train-data`: DataIter + `eval-data`: DataIter + If not nil, will be used as validation set and evaluate the + performance after each epoch. + `num-epoch`: int + Number of epochs to run training. + `fit-params`: FitParams + Extra parameters for training (see fit-params). + } + Ex: + (fit {:train-data train-iter :eval-data test-iter :num-epoch 100) + (fit {:train-data train-iter + :eval-data test-iter + :num-epoch 5 + :fit-params + (fit-params {:batch-end-callback (callback/speedometer 128 100) + :initializer (initializer/xavier) + :optimizer (optimizer/sgd {:learning-rate 0.01}) + :eval-metric (eval-metric/mse)}))" [mod {:keys [train-data eval-data num-epoch fit-params] :as opts - `:or {num-epoch 1 - fit-params (new FitParams)}}] + :or {num-epoch 1 + fit-params (new FitParams)}}] (util/validate! 
::fit-options opts "Invalid options for fit") (doto mod (.fit @@ -557,12 +681,13 @@ (s/def ::eval-data ::train-data) (s/def ::num-batch integer?) (s/def ::reset boolean?) -(s/def ::predict-opts (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset])) +(s/def ::predict-opts + (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset])) (defn predict-batch - "Run the predication on a data batch - - mod module - - data-batch data-batch" + "Run the predication on a data batch. + `mod`: Module + `data-batch`: data-batch" [mod data-batch] (util/validate! ::mx-io/data-batch data-batch "Invalid data batch") (util/coerce-return (.predict mod (if (map? data-batch) @@ -571,41 +696,60 @@ (defn predict "Run prediction and collect the outputs. - - mod module - - option map with - - :eval-data - - :num-batch Default is -1, indicating running all the batches in the data iterator. - - :reset Default is `True`, indicating whether we should reset the data iter before start - doing prediction. - The return value will be a vector of NDArrays `[out1, out2, out3]`. - Where each element is concatenation of the outputs for all the mini-batches." + `mod`: Module + `opts-map` { + `eval-data`: DataIter + `num-batch` int - Default is `-1` + Indicating running all the batches in the data iterator. + `reset`: boolean - Default is `true` + Indicating whether we should reset the data iter before start doing + prediction. + } + returns: vector of NDArrays `[out1, out2, out3]` where each element is the + concatenation of the outputs for all the mini-batches. + Ex: + (predict mod {:eval-data test-iter}) + (predict mod {:eval-data test-iter :num-batch 10 :reset false})" [mod {:keys [eval-data num-batch reset] :as opts :or {num-batch -1 reset true}}] (util/validate! 
::predict-opts opts "Invalid opts for predict") (util/scala-vector->vec (.predict mod eval-data (int num-batch) reset))) -(s/def ::predict-every-batch-opts (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset])) +(s/def ::predict-every-batch-opts + (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset])) (defn predict-every-batch - " Run prediction and collect the outputs. - - module - - option map with - :eval-data - :num-batch Default is -1, indicating running all the batches in the data iterator. - :reset Default is `True`, indicating whether we should reset the data iter before start - doing prediction. - The return value will be a nested list like - [[out1_batch1, out2_batch1, ...], [out1_batch2, out2_batch2, ...]]` - This mode is useful because in some cases (e.g. bucketing), - the module does not necessarily produce the same number of outputs." + "Run prediction and collect the outputs. + `mod`: Module + `opts-map` { + `eval-data`: DataIter + `num-batch` int - Default is `-1` + Indicating running all the batches in the data iterator. + `reset` boolean - Default is `true` + Indicating whether we should reset the data iter before start doing + prediction. + } + returns: nested list like this + `[[out1_batch1, out2_batch1, ...], [out1_batch2, out2_batch2, ...]]` + + Note: This mode is useful because in some cases (e.g. bucketing), the module + does not necessarily produce the same number of outputs. + Ex: + (predict-every-batch mod {:eval-data test-iter})" [mod {:keys [eval-data num-batch reset] :as opts :or {num-batch -1 reset true}}] - (util/validate! ::predict-every-batch-opts opts "Invalid opts for predict-every-batch") - (mapv util/scala-vector->vec (util/scala-vector->vec (.predictEveryBatch mod eval-data (int num-batch) reset)))) - -(s/def ::score-opts (s/keys :req-un [::eval-data ::eval-metric] :opt-un [::num-batch ::reset ::epoch])) + (util/validate! 
::predict-every-batch-opts + opts + "Invalid opts for predict-every-batch") + (mapv util/scala-vector->vec + (util/scala-vector->vec + (.predictEveryBatch mod eval-data (int num-batch) reset)))) + +(s/def ::score-opts + (s/keys :req-un [::eval-data ::eval-metric] + :opt-un [::num-batch ::reset ::epoch])) (defn exec-group [mod] (.execGroup mod)) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj index 651bdcb3f315..9caa00d49010 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj @@ -16,15 +16,18 @@ ;; (ns org.apache.clojure-mxnet.ndarray + "NDArray API for Clojure package." (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max min repeat reverse set sort take to-array empty shuffle ref]) - (:require [org.apache.clojure-mxnet.base :as base] - [org.apache.clojure-mxnet.context :as mx-context] - [org.apache.clojure-mxnet.shape :as mx-shape] - [org.apache.clojure-mxnet.util :as util] - [clojure.reflect :as r] - [t6.from-scala.core :refer [$] :as $]) + (:require + [clojure.spec.alpha :as s] + + [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [t6.from-scala.core :refer [$] :as $]) (:import (org.apache.mxnet NDArray))) ;; loads the generated functions into the namespace @@ -91,6 +94,27 @@ ([start stop] (arange start stop {}))) +(defn ->ndarray + "Creates a new NDArray based on the given n-dimenstional vector + of numbers. + `nd-vec`: n-dimensional vector with numbers. + `opts-map` { + `ctx`: Context of the output ndarray, will use default context if unspecified. + } + returns: `ndarray` with the given values and matching the shape of the input vector. 
+ Ex: + (->ndarray [5.0 -4.0]) + (->ndarray [5 -4] {:ctx (context/cpu)}) + (->ndarray [[1 2 3] [4 5 6]]) + (->ndarray [[[1.0] [2.0]]]" + ([nd-vec {:keys [ctx] + :or {ctx (mx-context/default-context)} + :as opts}] + (array (vec (clojure.core/flatten nd-vec)) + (util/nd-seq-shape nd-vec) + {:ctx ctx})) + ([nd-vec] (->ndarray nd-vec {}))) + (defn slice "Return a sliced NDArray that shares memory with current one." ([ndarray i] @@ -167,3 +191,46 @@ (defn shape-vec [ndarray] (mx-shape/->vec (shape ndarray))) + +(s/def ::ndarray #(instance? NDArray %)) +(s/def ::vector vector?) +(s/def ::sequential sequential?) +(s/def ::shape-vec-match-vec + (fn [[v vec-shape]] (= (count v) (reduce clojure.core/* 1 vec-shape)))) + +(s/fdef vec->nd-vec + :args (s/cat :v ::sequential :shape-vec ::sequential) + :ret ::vector) + +(defn- vec->nd-vec + "Convert a vector `v` into a n-dimensional vector given the `shape-vec` + Ex: + (vec->nd-vec [1 2 3] [1 1 3]) ;[[[1 2 3]]] + (vec->nd-vec [1 2 3 4 5 6] [2 3 1]) ;[[[1] [2] [3]] [[4] [5] [6]]] + (vec->nd-vec [1 2 3 4 5 6] [1 2 3]) ;[[[1 2 3]] [4 5 6]]] + (vec->nd-vec [1 2 3 4 5 6] [3 1 2]) ;[[[1 2]] [[3 4]] [[5 6]]] + (vec->nd-vec [1 2 3 4 5 6] [3 2]) ;[[1 2] [3 4] [5 6]]" + [v [s1 & ss :as shape-vec]] + (util/validate! ::sequential v "Invalid input vector `v`") + (util/validate! ::sequential shape-vec "Invalid input vector `shape-vec`") + (util/validate! ::shape-vec-match-vec + [v shape-vec] + "Mismatch between vector `v` and vector `shape-vec`") + (if-not (seq ss) + (vec v) + (->> v + (partition (clojure.core// (count v) s1)) + vec + (mapv #(vec->nd-vec % ss))))) + +(s/fdef ->nd-vec :args (s/cat :ndarray ::ndarray) :ret ::vector) + +(defn ->nd-vec + "Convert an ndarray `ndarray` into a n-dimensional Clojure vector. + Ex: + (->nd-vec (array [1] [1 1 1])) ;[[[1.0]]] + (->nd-vec (array [1 2 3] [3 1 1])) ;[[[1.0]] [[2.0]] [[3.0]]] + (->nd-vec (array [1 2 3 4 5 6]) [3 1 2]) ;[[[1.0 2.0]] [[3.0 4.0]] [[5.0 6.0]]]" + [ndarray] + (util/validate! 
::ndarray ndarray "Invalid input array") + (vec->nd-vec (->vec ndarray) (shape-vec ndarray))) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj new file mode 100644 index 000000000000..70359a6ef9b7 --- /dev/null +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj @@ -0,0 +1,32 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + +(ns org.apache.clojure-mxnet.ndarray-api + "Experimental NDArray API" + (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [clojure.reflect :as r] + [t6.from-scala.core :refer [$] :as $]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/ndarray_api")) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj index 0ec2039ba79b..1261e659e6dc 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj @@ -16,70 +16,84 @@ ;; (ns org.apache.clojure-mxnet.random + "Random Number interface of mxnet." (:require - [org.apache.clojure-mxnet.shape :as mx-shape] - [org.apache.clojure-mxnet.context :as context] [clojure.spec.alpha :as s] + [org.apache.clojure-mxnet.context :as context] + [org.apache.clojure-mxnet.shape :as mx-shape] [org.apache.clojure-mxnet.util :as util]) (:import (org.apache.mxnet Context Random))) (s/def ::low number?) (s/def ::high number?) +(s/def ::low-high (fn [[low high]] (<= low high))) (s/def ::shape-vec (s/coll-of pos-int? :kind vector?)) (s/def ::ctx #(instance? Context %)) (s/def ::uniform-opts (s/keys :opt-un [::ctx])) (defn uniform - "Generate uniform distribution in [low, high) with shape. - low: The lower bound of distribution. - high: The upper bound of distribution. - shape-vec: vector shape of the ndarray generated. - opts-map { - ctx: Context of output ndarray, will use default context if not specified. 
- out: Output place holder} - returns: The result ndarray with generated result./" + "Generate uniform distribution in [`low`, `high`) with shape. + `low`: The lower bound of distribution. + `high`: The upper bound of distribution. + `shape-vec`: vector shape of the ndarray generated. + `opts-map` { + `ctx`: Context of output ndarray, will use default context if not specified. + `out`: Output place holder} + returns: The result ndarray with generated result. + Ex: + (uniform 0 1 [1 10]) + (uniform -10 10 [100 100])" ([low high shape-vec {:keys [ctx out] :as opts}] - (util/validate! ::uniform-opts opts "Incorrect random uniform parameters") + (util/validate! ::uniform-opts opts "Incorrect random uniform parameters") (util/validate! ::low low "Incorrect random uniform parameter") (util/validate! ::high high "Incorrect random uniform parameters") + (util/validate! ::low-high [low high] "Incorrect random uniform parameters") (util/validate! ::shape-vec shape-vec "Incorrect random uniform parameters") (Random/uniform (float low) (float high) (mx-shape/->shape shape-vec) ctx out)) ([low high shape-vec] (uniform low high shape-vec {}))) (s/def ::loc number?) -(s/def ::scale number?) +(s/def ::scale (s/and number? pos?)) (s/def ::normal-opts (s/keys :opt-un [::ctx])) (defn normal - "Generate normal(Gaussian) distribution N(mean, stdvar^^2) with shape. - loc: The standard deviation of the normal distribution - scale: The upper bound of distribution. - shape-vec: vector shape of the ndarray generated. - opts-map { - ctx: Context of output ndarray, will use default context if not specified. - out: Output place holder} - returns: The result ndarray with generated result./" + "Generate normal (Gaussian) distribution N(mean, stdvar^^2) with shape. + `loc`: Mean (centre) of the distribution. + `scale`: Standard deviation (spread or width) of the distribution. + `shape-vec`: vector shape of the ndarray generated. 
+ `opts-map` { + `ctx`: Context of output ndarray, will use default context if not specified. + `out`: Output place holder} + returns: The result ndarray with generated result. + Ex: + (normal 0 1 [10 10]) + (normal -5 4 [2 3])" ([loc scale shape-vec {:keys [ctx out] :as opts}] (util/validate! ::normal-opts opts "Incorrect random normal parameters") (util/validate! ::loc loc "Incorrect random normal parameters") (util/validate! ::scale scale "Incorrect random normal parameters") (util/validate! ::shape-vec shape-vec "Incorrect random uniform parameters") - (Random/normal (float loc) (float scale) (mx-shape/->shape shape-vec) ctx out)) + (Random/normal (float loc) + (float scale) + (mx-shape/->shape shape-vec) ctx out)) ([loc scale shape-vec] (normal loc scale shape-vec {}))) (s/def ::seed-state number?) (defn seed - " Seed the random number generators in mxnet. - This seed will affect behavior of functions in this module, - as well as results from executors that contains Random number - such as Dropout operators. + "Seed the random number generators in mxnet. + This seed will affect behavior of functions in this module, + as well as results from executors that contains Random number + such as Dropout operators. - seed-state: The random number seed to set to all devices. + `seed-state`: The random number seed to set to all devices. note: The random number generator of mxnet is by default device specific. This means if you set the same seed, the random number sequence - generated from GPU0 can be different from CPU." + generated from GPU0 can be different from CPU. + Ex: + (seed-state 42) + (seed-state 42.0)" [seed-state] (util/validate! 
::seed-state seed-state "Incorrect seed parameters") - (Random/seed (int seed-state))) \ No newline at end of file + (Random/seed (int seed-state))) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_api.clj new file mode 100644 index 000000000000..69cc8136d500 --- /dev/null +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_api.clj @@ -0,0 +1,32 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + +(ns org.apache.clojure-mxnet.symbol-api + "Experimental Symbol API" + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.executor :as ex] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [t6.from-scala.core :refer [$] :as $] + [org.apache.clojure-mxnet.ndarray :as ndarray]) + (:import (org.apache.mxnet SymbolAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/symbol_api")) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj index 6b5f50792ead..9dc6c8f88ddd 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj @@ -35,7 +35,6 @@ "int<>" "vec-of-ints" "float<>" "vec-of-floats" "byte<>" "byte-array" - "java.lang.String<>" "vec-or-strings" "org.apache.mxnet.NDArray" "ndarray" "org.apache.mxnet.Symbol" "sym" "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE" "double-or-float"}) @@ -49,7 +48,7 @@ "int<>" "vec-of-ints" "float<>" "vec-of-floats" "byte<>" "byte-array" - "java.lang.String<>" "vec-or-strings" + "java.lang.String<>" "vec-of-strings" "org.apache.mxnet.Symbol" "sym" "java.lang.Object" "object"}) @@ -74,8 +73,17 @@ (defn option->value [opt] ($/view opt)) -(defn keyword->snake-case [vals] - (mapv (fn [v] (if (keyword? v) (string/replace (name v) "-" "_") v)) vals)) +(defn keyword->snake-case + "Transforms a keyword `kw` into a snake-case string. + `kw`: keyword + returns: string + Ex: + (keyword->snake-case :foo-bar) ;\"foo_bar\" + (keyword->snake-case :foo) ;\"foo\"" + [kw] + (if (keyword? 
kw) + (string/replace (name kw) "-" "_") + kw)) (defn convert-tuple [param] (apply $/tuple param)) @@ -111,8 +119,8 @@ (empty-map) (apply $/immutable-map (->> param (into []) - flatten - keyword->snake-case)))) + (flatten) + (mapv keyword->snake-case))))) (defn convert-symbol-map [param] (convert-map (tuple-convert-by-param-name param))) @@ -143,9 +151,12 @@ (and (get targets "scala.collection.Seq") (instance? org.apache.mxnet.Symbol param)) ($/immutable-list param) (and (get targets "scala.collection.Seq") (and (or (vector? param) (seq? param)) (empty? param))) (empty-list) (and (get targets "scala.collection.Seq") (or (vector? param) (seq? param))) (apply $/immutable-list param) + (and (get targets "org.apache.mxnet.Shape") (or (vector? param) (seq? param) (empty? param))) (mx-shape/->shape param) (and (get targets "int<>") (vector? param)) (int-array param) (and (get targets "float<>") (vector? param)) (float-array param) (and (get targets "java.lang.String<>") (vector? param)) (into-array param) + (and (get targets "org.apache.mxnet.NDArray<>") (vector? param)) (into-array param) + (and (get targets "org.apache.mxnet.Symbol<>") (vector? param)) (into-array param) (and (get targets "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE") (instance? Float param)) (primitives/mx-float param) (and (get targets "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE") (number? param)) (primitives/mx-double param) :else param)) @@ -218,8 +229,28 @@ (throw (ex-info error-msg (s/explain-data spec value))))) +(s/def ::non-empty-seq (s/and sequential? not-empty)) +(defn to-array-nd + "Converts any N-D sequential structure to an array + with the same dimensions." + [nd-seq] + (validate! ::non-empty-seq nd-seq "Invalid N-D sequence") + (if (sequential? (first nd-seq)) + (to-array (mapv to-array-nd nd-seq)) + (to-array nd-seq))) + +(defn nd-seq-shape + "Computes the shape of a n-dimensional sequential structure" + [nd-seq] + (validate! 
::non-empty-seq nd-seq "Invalid N-D sequence") + (loop [s nd-seq + shape [(count s)]] + (if (sequential? (first s)) + (recur (first s) (conj shape (count (first s)))) + shape))) + (defn map->scala-tuple-seq - "* Convert a map to a scala-Seq of scala-Tubple. + "* Convert a map to a scala-Seq of scala-Tuple. * Should also work if a seq of seq of 2 things passed. * Otherwise passed through unchanged." [map-or-tuple-seq] diff --git a/contrib/clojure-package/test/dev/generator_test.clj b/contrib/clojure-package/test/dev/generator_test.clj index a3ec338921ba..cf28241c59e8 100644 --- a/contrib/clojure-package/test/dev/generator_test.clj +++ b/contrib/clojure-package/test/dev/generator_test.clj @@ -50,6 +50,127 @@ (is (= transformed-params (gen/symbol-transform-param-name (:parameter-types (symbol-reflect-info "floor"))))))) +(deftest test-gen-op-info + (testing "activation" + (let [activation-info (gen/gen-op-info "Activation")] + (is (= "activation" (:fn-name activation-info))) + (is (string? (:fn-description activation-info))) + (is (= 2 (-> activation-info :args count))) + (is (= "" (:key-var-num-args activation-info))) + + (is (= "data" (-> activation-info :args first :name))) + (is (= "NDArray-or-Symbol" (-> activation-info :args first :type))) + (is (false? (-> activation-info :args first :optional?))) + (is (nil? (-> activation-info :args first :default))) + (is (string? (-> activation-info :args first :description))) + + (is (= "act-type" (-> activation-info :args second :name))) + (is (= "'relu', 'sigmoid', 'softrelu', 'softsign', 'tanh'" (-> activation-info :args second :type))) + (is (false? (-> activation-info :args second :optional?))) + (is (nil? (-> activation-info :args second :default))) + (is (string? 
(-> activation-info :args second :description))))) + + (testing "argmin" + (let [argmin-info (gen/gen-op-info "argmin")] + (is (= "argmin" (:fn-name argmin-info))) + (is (= 3 (-> argmin-info :args count))) + + (is (= "data" (-> argmin-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol" (-> argmin-info :args (nth 0) :type))) + (is (false? (-> argmin-info :args (nth 0) :optional?))) + + (is (= "axis" (-> argmin-info :args (nth 1) :name))) + (is (= "int or None" (-> argmin-info :args (nth 1) :type))) + (is (= "'None'" (-> argmin-info :args (nth 1) :default))) + (is (true? (-> argmin-info :args (nth 1) :optional?))) + + (is (= "keepdims" (-> argmin-info :args (nth 2) :name))) + (is (= "boolean" (-> argmin-info :args (nth 2) :type))) + (is (= "0" (-> argmin-info :args (nth 2) :default))) + (is (true? (-> argmin-info :args (nth 2) :optional?))))) + + (testing "concat" + (let [concat-info (gen/gen-op-info "Concat")] + (is (= "concat" (:fn-name concat-info))) + (is (= 3 (-> concat-info :args count))) + (is (= "num-args" (:key-var-num-args concat-info))) + + (is (= "data" (-> concat-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol[]" (-> concat-info :args (nth 0) :type))) + (is (false? (-> concat-info :args (nth 0) :optional?))) + + (is (= "num-args" (-> concat-info :args (nth 1) :name))) + (is (= "int" (-> concat-info :args (nth 1) :type))) + (is (false? (-> concat-info :args (nth 1) :optional?))) + + (is (= "dim" (-> concat-info :args (nth 2) :name))) + (is (= "int" (-> concat-info :args (nth 2) :type))) + (is (= "'1'" (-> concat-info :args (nth 2) :default))) + (is (true? 
(-> concat-info :args (nth 2) :optional?))))) + + (testing "convolution" + (let [convolution-info (gen/gen-op-info "Convolution")] + + (is (= "convolution" (:fn-name convolution-info))) + (is (= 14 (-> convolution-info :args count))) + (is (= "" (:key-var-num-args convolution-info))) + + (is (= "data" (-> convolution-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol" (-> convolution-info :args (nth 0) :type))) + (is (false? (-> convolution-info :args (nth 0) :optional?))) + + (is (= "weight" (-> convolution-info :args (nth 1) :name))) + (is (= "NDArray-or-Symbol" (-> convolution-info :args (nth 1) :type))) + (is (false? (-> convolution-info :args (nth 1) :optional?))) + + (is (= "kernel" (-> convolution-info :args (nth 3) :name))) + (is (= "Shape" (-> convolution-info :args (nth 3) :type))) + (is (= "(tuple)" (-> convolution-info :args (nth 3) :spec))) + (is (false? (-> convolution-info :args (nth 3) :optional?))) + + (is (= "stride" (-> convolution-info :args (nth 4) :name))) + (is (= "Shape" (-> convolution-info :args (nth 4) :type))) + (is (= "(tuple)" (-> convolution-info :args (nth 4) :spec))) + (is (= "[]" (-> convolution-info :args (nth 4) :default))) + (is (true? (-> convolution-info :args (nth 4) :optional?))) + + (is (= "num-filter" (-> convolution-info :args (nth 7) :name))) + (is (= "int" (-> convolution-info :args (nth 7) :type))) + (is (= "(non-negative)" (-> convolution-info :args (nth 7) :spec))) + (is (false? (-> convolution-info :args (nth 7) :optional?))) + + (is (= "num-group" (-> convolution-info :args (nth 8) :name))) + (is (= "int" (-> convolution-info :args (nth 8) :type))) + (is (= "(non-negative)" (-> convolution-info :args (nth 8) :spec))) + (is (= "1" (-> convolution-info :args (nth 8) :default))) + (is (true? 
(-> convolution-info :args (nth 8) :optional?))) + + (is (= "workspace" (-> convolution-info :args (nth 9) :name))) + (is (= "long" (-> convolution-info :args (nth 9) :type))) + (is (= "(non-negative)" (-> convolution-info :args (nth 9) :spec))) + (is (= "1024" (-> convolution-info :args (nth 9) :default))) + (is (true? (-> convolution-info :args (nth 9) :optional?))) + + (is (= "no-bias" (-> convolution-info :args (nth 10) :name))) + (is (= "boolean" (-> convolution-info :args (nth 10) :type))) + (is (= "0" (-> convolution-info :args (nth 10) :default))) + (is (true? (-> convolution-info :args (nth 10) :optional?))) + + (is (= "layout" (-> convolution-info :args (nth 13) :name))) + (is (= "None, 'NCDHW', 'NCHW', 'NCW', 'NDHWC', 'NHWC'" (-> convolution-info :args (nth 13) :type))) + (is (= "'None'" (-> convolution-info :args (nth 13) :default))) + (is (true? (-> convolution-info :args (nth 13) :optional?))))) + + (testing "element wise sum" + (let [element-wise-sum-info (gen/gen-op-info "ElementWiseSum")] + (is (= "add-n" (:fn-name element-wise-sum-info))) + (is (= 1 (-> element-wise-sum-info :args count))) + (is (= "num-args" (:key-var-num-args element-wise-sum-info))) + + (is (= "args" (-> element-wise-sum-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol[]" (-> element-wise-sum-info :args (nth 0) :type))) + (is (false? 
(-> element-wise-sum-info :args (nth 0) :optional?)))))) + (deftest test-ndarray-transform-param-name (let [params ["scala.collection.immutable.Map" "scala.collection.Seq"] @@ -68,7 +189,10 @@ (deftest test-rename-duplicate-params (is (= ["foo" "bar" "baz"] (gen/rename-duplicate-params ["foo" "bar" "baz"]))) - (is (= ["foo" "bar" "bar-1"] (gen/rename-duplicate-params ["foo" "bar" "bar"])))) + (is (= ["foo" "bar" "bar-1"] (gen/rename-duplicate-params ["foo" "bar" "bar"]))) + (is (= ["foo" "bar" "bar-1" "foo-1"] (gen/rename-duplicate-params ["foo" "bar" "bar" "foo"]))) + (is (= ["foo" "bar" "bar-1" "bar-2"] (gen/rename-duplicate-params ["foo" "bar" "bar" "bar"]))) + (is (= ["foo" "bar" "bar-1" "bar-2" "foo-1" "baz"] (gen/rename-duplicate-params ["foo" "bar" "bar" "bar" "foo" "baz"])))) (deftest test-is-symbol-hand-gen? (is (not (false? (gen/is-symbol-hand-gen? (symbol-reflect-info "max"))))) @@ -86,18 +210,19 @@ (is (= "LRN" (-> lrn-info vals ffirst :name str))))) (deftest test-symbol-vector-args - (is (= `(if (clojure.core/map? kwargs-map-or-vec-or-sym) + (is (= '(if (clojure.core/map? kwargs-map-or-vec-or-sym) (util/empty-list) (util/coerce-param kwargs-map-or-vec-or-sym - #{"scala.collection.Seq"}))) (gen/symbol-vector-args))) + #{"scala.collection.Seq"})) + (gen/symbol-vector-args)))) (deftest test-symbol-map-args - (is (= `(if (clojure.core/map? kwargs-map-or-vec-or-sym) + (is (= '(if (clojure.core/map? kwargs-map-or-vec-or-sym) (org.apache.clojure-mxnet.util/convert-symbol-map kwargs-map-or-vec-or-sym) - nil)) - (gen/symbol-map-args))) + nil) + (gen/symbol-map-args)))) (deftest test-add-symbol-arities (let [params (map symbol ["sym-name" "kwargs-map" "symbol-list" "kwargs-map-1"]) @@ -112,36 +237,36 @@ ar1)) (is (= '([sym-name kwargs-map-or-vec-or-sym] (foo - sym-name - nil - (if - (clojure.core/map? kwargs-map-or-vec-or-sym) - (util/empty-list) - (util/coerce-param - kwargs-map-or-vec-or-sym - #{"scala.collection.Seq"})) - (if - (clojure.core/map? 
kwargs-map-or-vec-or-sym) - (org.apache.clojure-mxnet.util/convert-symbol-map - kwargs-map-or-vec-or-sym) - nil)))) - ar2) + sym-name + nil + (if + (clojure.core/map? kwargs-map-or-vec-or-sym) + (util/empty-list) + (util/coerce-param + kwargs-map-or-vec-or-sym + #{"scala.collection.Seq"})) + (if + (clojure.core/map? kwargs-map-or-vec-or-sym) + (org.apache.clojure-mxnet.util/convert-symbol-map + kwargs-map-or-vec-or-sym) + nil))) + ar2)) (is (= '([kwargs-map-or-vec-or-sym] (foo - nil - nil - (if - (clojure.core/map? kwargs-map-or-vec-or-sym) - (util/empty-list) - (util/coerce-param - kwargs-map-or-vec-or-sym - #{"scala.collection.Seq"})) - (if - (clojure.core/map? kwargs-map-or-vec-or-sym) - (org.apache.clojure-mxnet.util/convert-symbol-map - kwargs-map-or-vec-or-sym) - nil)))) - ar3))) + nil + nil + (if + (clojure.core/map? kwargs-map-or-vec-or-sym) + (util/empty-list) + (util/coerce-param + kwargs-map-or-vec-or-sym + #{"scala.collection.Seq"})) + (if + (clojure.core/map? kwargs-map-or-vec-or-sym) + (org.apache.clojure-mxnet.util/convert-symbol-map + kwargs-map-or-vec-or-sym) + nil))) + ar3)))) (deftest test-gen-symbol-function-arity (let [op-name (symbol "$div") @@ -157,14 +282,14 @@ :exception-types [], :flags #{:public}}]} function-name (symbol "div")] - (is (= '(([sym sym-or-Object] + (is (= '(([sym sym-or-object] (util/coerce-return (.$div sym (util/nil-or-coerce-param - sym-or-Object - #{"org.apache.mxnet.Symbol" "java.lang.Object"})))))) - (gen/gen-symbol-function-arity op-name op-values function-name)))) + sym-or-object + #{"org.apache.mxnet.Symbol" "java.lang.Object"}))))) + (gen/gen-symbol-function-arity op-name op-values function-name))))) (deftest test-gen-ndarray-function-arity (let [op-name (symbol "$div") @@ -182,15 +307,25 @@ :flags #{:public}}]}] (is (= '(([ndarray num-or-ndarray] (util/coerce-return - (.$div - ndarray - (util/coerce-param - num-or-ndarray - #{"float" "org.apache.mxnet.NDArray"})))))) - (gen/gen-ndarray-function-arity op-name 
op-values)))) + (.$div + ndarray + (util/coerce-param + num-or-ndarray + #{"float" "org.apache.mxnet.NDArray"}))))) + (gen/gen-ndarray-function-arity op-name op-values))))) (deftest test-write-to-file - (testing "symbol" + (testing "symbol-api" + (let [fname "test/test-symbol-api.clj" + _ (gen/write-to-file [(first gen/all-symbol-api-functions) + (second gen/all-symbol-api-functions)] + gen/symbol-api-gen-ns + fname) + good-contents (slurp "test/good-test-symbol-api.clj") + contents (slurp fname)] + (is (= good-contents contents)))) + + (testing "symbol" (let [fname "test/test-symbol.clj" _ (gen/write-to-file [(first gen/all-symbol-functions)] gen/symbol-gen-ns @@ -199,6 +334,16 @@ contents (slurp fname)] (is (= good-contents contents)))) + (testing "ndarray-api" + (let [fname "test/test-ndarray-api.clj" + _ (gen/write-to-file [(first gen/all-ndarray-api-functions) + (second gen/all-ndarray-api-functions)] + gen/ndarray-api-gen-ns + fname) + good-contents (slurp "test/good-test-ndarray-api.clj") + contents (slurp fname)] + (is (= good-contents contents)))) + (testing "ndarray" (let [fname "test/test-ndarray.clj" _ (gen/write-to-file [(first gen/all-ndarray-functions)] diff --git a/contrib/clojure-package/test/good-test-ndarray-api.clj b/contrib/clojure-package/test/good-test-ndarray-api.clj new file mode 100644 index 000000000000..1b83a7beb7bc --- /dev/null +++ b/contrib/clojure-package/test/good-test-ndarray-api.clj @@ -0,0 +1,89 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.ndarray-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + (:require [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. 
See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + activation + "Applies an activation function element-wise to the input.\n\nThe following activation functions are supported:\n\n- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`\n- `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`\n- `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`\n- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`\n- `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`\n\n\n\nDefined in src/operator/nn/activation.cc:L167\n\n`data`: The input array.\n`act-type`: Activation function to be applied.\n`out`: Output array. (optional)\n" + ([data act-type] (activation {:data data, :act-type act-type})) + ([{:keys [data act-type out], :or {out nil}, :as opts}] + (util/coerce-return + (NDArrayAPI/Activation data act-type (util/->option out))))) + +(defn + batch-norm + "Batch normalization.\n\nNormalizes a data batch by mean and variance, and applies a scale ``gamma`` as\nwell as offset ``beta``.\n\nAssume the input has more than one dimension and we normalize along axis 1.\nWe first compute the mean and variance along this axis:\n\n.. 
math::\n\n data\\_mean[i] = mean(data[:,i,:,...]) \\\\\n data\\_var[i] = var(data[:,i,:,...])\n\nThen compute the normalized output, which has the same shape as input, as following:\n\n.. math::\n\n out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]\n\nBoth *mean* and *var* returns a scalar by treating the input as a vector.\n\nAssume the input has size *k* on axis 1, then both ``gamma`` and ``beta``\nhave shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and\nthe inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these\ntwo outputs are blocked.\n\nBesides the inputs and the outputs, this operator accepts two auxiliary\nstates, ``moving_mean`` and ``moving_var``, which are *k*-length\nvectors. They are global statistics for the whole dataset, which are updated\nby::\n\n moving_mean = moving_mean * momentum + data_mean * (1 - momentum)\n moving_var = moving_var * momentum + data_var * (1 - momentum)\n\nIf ``use_global_stats`` is set to be true, then ``moving_mean`` and\n``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute\nthe output. It is often used during inference.\n\nThe parameter ``axis`` specifies which axis of the input shape denotes\nthe 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel\naxis to be the last item in the input shape.\n\nBoth ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,\nthen set ``gamma`` to 1 and its gradient to 0.\n\n.. Note::\n When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,\n the sparse tensors will fallback.\n\n\n\nDefined in src/operator/nn/batch_norm.cc:L574\n\n`data`: Input data to batch normalization\n`gamma`: gamma array\n`beta`: beta array\n`moving-mean`: running mean of input\n`moving-var`: running variance of input\n`eps`: Epsilon to prevent div 0. 
Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)\n`momentum`: Momentum for moving average (optional)\n`fix-gamma`: Fix gamma while training (optional)\n`use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional)\n`output-mean-var`: Output the mean and inverse std (optional)\n`axis`: Specify which shape axis the channel is specified (optional)\n`cudnn-off`: Do not select CUDNN operator, if available (optional)\n`out`: Output array. (optional)\n" + ([data gamma beta moving-mean moving-var] + (batch-norm + {:data data, + :gamma gamma, + :beta beta, + :moving-mean moving-mean, + :moving-var moving-var})) + ([{:keys + [data + gamma + beta + moving-mean + moving-var + eps + momentum + fix-gamma + use-global-stats + output-mean-var + axis + cudnn-off + out], + :or + {eps nil, + momentum nil, + fix-gamma nil, + use-global-stats nil, + output-mean-var nil, + axis nil, + cudnn-off nil, + out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/BatchNorm + data + gamma + beta + moving-mean + moving-var + (util/->option eps) + (util/->option momentum) + (util/->option fix-gamma) + (util/->option use-global-stats) + (util/->option output-mean-var) + (util/->option axis) + (util/->option cudnn-off) + (util/->option out))))) + diff --git a/contrib/clojure-package/test/good-test-symbol-api.clj b/contrib/clojure-package/test/good-test-symbol-api.clj new file mode 100644 index 000000000000..a03088486ee8 --- /dev/null +++ b/contrib/clojure-package/test/good-test-symbol-api.clj @@ -0,0 +1,109 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.symbol-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.util :as util] + [org.apache.clojure-mxnet.shape :as mx-shape]) + 
(:import (org.apache.mxnet SymbolAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + activation + "Applies an activation function element-wise to the input.\n\nThe following activation functions are supported:\n\n- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`\n- `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`\n- `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`\n- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`\n- `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`\n\n\n\nDefined in src/operator/nn/activation.cc:L167\n\n`data`: The input array. 
(optional)\n`act-type`: Activation function to be applied.\n`name`: Name of the symbol (optional)\n`attr`: Attributes of the symbol (optional)\n" + [{:keys [data act-type name attr], + :or {data nil, name nil, attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/Activation + (util/->option data) + act-type + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + +(defn + batch-norm + "Batch normalization.\n\nNormalizes a data batch by mean and variance, and applies a scale ``gamma`` as\nwell as offset ``beta``.\n\nAssume the input has more than one dimension and we normalize along axis 1.\nWe first compute the mean and variance along this axis:\n\n.. math::\n\n data\\_mean[i] = mean(data[:,i,:,...]) \\\\\n data\\_var[i] = var(data[:,i,:,...])\n\nThen compute the normalized output, which has the same shape as input, as following:\n\n.. math::\n\n out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]\n\nBoth *mean* and *var* returns a scalar by treating the input as a vector.\n\nAssume the input has size *k* on axis 1, then both ``gamma`` and ``beta``\nhave shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and\nthe inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these\ntwo outputs are blocked.\n\nBesides the inputs and the outputs, this operator accepts two auxiliary\nstates, ``moving_mean`` and ``moving_var``, which are *k*-length\nvectors. 
They are global statistics for the whole dataset, which are updated\nby::\n\n moving_mean = moving_mean * momentum + data_mean * (1 - momentum)\n moving_var = moving_var * momentum + data_var * (1 - momentum)\n\nIf ``use_global_stats`` is set to be true, then ``moving_mean`` and\n``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute\nthe output. It is often used during inference.\n\nThe parameter ``axis`` specifies which axis of the input shape denotes\nthe 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel\naxis to be the last item in the input shape.\n\nBoth ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,\nthen set ``gamma`` to 1 and its gradient to 0.\n\n.. Note::\n When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,\n the sparse tensors will fallback.\n\n\n\nDefined in src/operator/nn/batch_norm.cc:L574\n\n`data`: Input data to batch normalization (optional)\n`gamma`: gamma array (optional)\n`beta`: beta array (optional)\n`moving-mean`: running mean of input (optional)\n`moving-var`: running variance of input (optional)\n`eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)\n`momentum`: Momentum for moving average (optional)\n`fix-gamma`: Fix gamma while training (optional)\n`use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. 
(optional)\n`output-mean-var`: Output the mean and inverse std (optional)\n`axis`: Specify which shape axis the channel is specified (optional)\n`cudnn-off`: Do not select CUDNN operator, if available (optional)\n`name`: Name of the symbol (optional)\n`attr`: Attributes of the symbol (optional)\n" + [{:keys + [data + gamma + beta + moving-mean + moving-var + eps + momentum + fix-gamma + use-global-stats + output-mean-var + axis + cudnn-off + name + attr], + :or + {output-mean-var nil, + axis nil, + cudnn-off nil, + fix-gamma nil, + eps nil, + data nil, + attr nil, + beta nil, + name nil, + use-global-stats nil, + moving-mean nil, + moving-var nil, + momentum nil, + gamma nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/BatchNorm + (util/->option data) + (util/->option gamma) + (util/->option beta) + (util/->option moving-mean) + (util/->option moving-var) + (util/->option eps) + (util/->option momentum) + (util/->option fix-gamma) + (util/->option use-global-stats) + (util/->option output-mean-var) + (util/->option axis) + (util/->option cudnn-off) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj index feda45b9d027..ca9d4bc93986 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj @@ -24,6 +24,8 @@ [org.apache.clojure-mxnet.module :as m] [org.apache.clojure-mxnet.optimizer :as optimizer] [org.apache.clojure-mxnet.symbol :as sym] + [org.apache.clojure-mxnet.symbol-api :as sym-api] + [org.apache.clojure-mxnet.util :as util] [clojure.reflect :as r])) (def data-dir "data/") @@ -54,17 +56,19 @@ (defn get-symbol [] (as-> (sym/variable "data") data - (sym/convolution "conv1" {:data 
data :kernel [3 3] :num-filter 32 :stride [2 2]}) - (sym/batch-norm "bn1" {:data data}) - (sym/activation "relu1" {:data data :act-type "relu"}) - (sym/pooling "mp1" {:data data :kernel [2 2] :pool-type "max" :stride [2 2]}) (sym/convolution "conv2" {:data data :kernel [3 3] :num-filter 32 :stride [2 2]}) - (sym/batch-norm "bn2" {:data data}) - (sym/activation "relu2" {:data data :act-type "relu"}) - (sym/pooling "mp2" {:data data :kernel [2 2] :pool-type "max" :stride [2 2]}) + (sym-api/convolution {:name "conv1" :data data :kernel [3 3] :num-filter 32 :stride [2 2]}) + (sym-api/batch-norm {:name "bn1" :data data}) + (sym-api/activation {:name "relu1" :data data :act-type "relu"}) + (sym-api/pooling {:name "mp1" :data data :kernel [2 2] :pool-type "max" :stride [2 2]}) - (sym/flatten "fl" {:data data}) - (sym/fully-connected "fc2" {:data data :num-hidden 10}) - (sym/softmax-output "softmax" {:data data}))) + (sym-api/convolution {:name "conv2" :data data :kernel [3 3] :num-filter 32 :stride [2 2]}) + (sym-api/batch-norm {:name "bn2" :data data}) + (sym-api/activation {:name "relu2" :data data :act-type "relu"}) + (sym-api/pooling {:name "mp2" :data data :kernel [2 2] :pool-type "max" :stride [2 2]}) + + (sym-api/flatten {:name "fl" :data data}) + (sym-api/fully-connected {:name "fc2" :data data :num-hidden 10}) + (sym-api/softmax-output {:name "softmax" :data data}))) (deftest test-conv [] (let [mod (m/module (get-symbol))] diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/eval_metric_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/eval_metric_test.clj index d6da2ec9ee58..1f4dba35fa7a 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/eval_metric_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/eval_metric_test.clj @@ -57,3 +57,12 @@ "my-metric")] (eval-metric/update metric [(ndarray/ones [2])] [(ndarray/ones [2])]) (is (= ["my-metric" 0.0] (eval-metric/get metric))))) + +(deftest 
test-comp-metric + (let [metric (eval-metric/comp-metric [(eval-metric/accuracy) + (eval-metric/f1) + (eval-metric/top-k-accuracy 2)])] + (eval-metric/update metric [(ndarray/ones [2])] [(ndarray/ones [2 3])]) + (is (= {"accuracy" 0.0 + "f1" 0.0 + "top_k_accuracy" 1.0} (eval-metric/get metric))))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/executor_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/executor_test.clj index fb73f0091562..ebd1a9d061a4 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/executor_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/executor_test.clj @@ -65,10 +65,10 @@ (map ndarray/->vec) first))) ;; test shared memory - (is (= [4.0 4.0 4.0]) (->> (executor/outputs exec) - (map ndarray/->vec) - first - (take 3))) + (is (= [4.0 4.0 4.0] (->> (executor/outputs exec) + (map ndarray/->vec) + first + (take 3)))) ;; test base exec forward (executor/forward exec) (is (every? #(= 4.0 %) (->> (executor/outputs exec) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj index 38ab11c86012..23b88d07e896 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj @@ -20,7 +20,8 @@ [org.apache.clojure-mxnet.ndarray :as ndarray] [clojure.java.io :as io] [clojure.test :refer :all]) - (:import (javax.imageio ImageIO))) + (:import (javax.imageio ImageIO) + (java.io File))) (def tmp-dir (System/getProperty "java.io.tmpdir")) (def image-path (.getAbsolutePath (io/file tmp-dir "Pug-Cookie.jpg"))) @@ -76,4 +77,15 @@ (let [img-arr (image/read-image image-path) resized-arr (image/resize-image img-arr 224 224) new-img (image/to-image resized-arr)] - (is (= true (ImageIO/write new-img "png" (io/file tmp-dir "out.png")))))) + (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) + +(deftest 
test-draw-bounding-box! + (let [orig-img (ImageIO/read (new File image-path)) + new-img (-> orig-img + (image/draw-bounding-box! [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} + {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] + {:stroke 2 + :names ["pug" "cookie"] + :transparency 0.8 + :font-size-mult 2.0}))] + (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj index e3935c31e342..b7f468f341cd 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj @@ -48,7 +48,7 @@ (is (= 10 (count predictions-with-default-dtype))) (is (= 5 (count predictions))) (is (= "n02123159 tiger cat" (:class (first predictions)))) - (is (= (< 0 (:prob (first predictions)) 1))))) + (is (< 0 (:prob (first predictions)) 1)))) (deftest test-batch-classification (let [classifier (create-classifier) @@ -61,7 +61,7 @@ (is (= 10 (count batch-predictions-with-default-dtype))) (is (= 5 (count predictions))) (is (= "n02123159 tiger cat" (:class (first predictions)))) - (is (= (< 0 (:prob (first predictions)) 1))))) + (is (< 0 (:prob (first predictions)) 1)))) (deftest test-single-classification-with-ndarray (let [classifier (create-classifier) @@ -74,7 +74,7 @@ (is (= 1000 (count predictions-all))) (is (= 5 (count predictions))) (is (= "n02123159 tiger cat" (:class (first predictions)))) - (is (= (< 0 (:prob (first predictions)) 1))))) + (is (< 0 (:prob (first predictions)) 1)))) (deftest test-single-classify (let [classifier (create-classifier) @@ -87,7 +87,7 @@ (is (= 1000 (count predictions-all))) (is (= 5 (count predictions))) (is (= "n02123159 tiger cat" (:class (first predictions)))) - (is (= (< 0 (:prob (first predictions)) 1))))) + (is (< 0 (:prob (first predictions)) 
1)))) (deftest test-base-classification-with-ndarray (let [descriptors [{:name "data" @@ -105,7 +105,7 @@ (is (= 1000 (count predictions-all))) (is (= 5 (count predictions))) (is (= "n02123159 tiger cat" (:class (first predictions)))) - (is (= (< 0 (:prob (first predictions)) 1))))) + (is (< 0 (:prob (first predictions)) 1)))) (deftest test-base-single-classify (let [descriptors [{:name "data" @@ -123,6 +123,6 @@ (is (= 1000 (count predictions-all))) (is (= 5 (count predictions))) (is (= "n02123159 tiger cat" (:class (first predictions)))) - (is (= (< 0 (:prob (first predictions)) 1))))) + (is (< 0 (:prob (first predictions)) 1)))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj index d53af2ec249d..e03c43848332 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj @@ -246,6 +246,7 @@ d-shape1 [10 3 64 64] d-shape2 [10 3 32 32] l-shape [10] + mod (m/module my-sym {:data-names ["data1" "data2"]}) data-batch {:data [(ndarray/random-uniform 0 9 (str (mx-shape/->shape d-shape1))) (ndarray/random-uniform 5 15 (str (mx-shape/->shape d-shape2)))] @@ -261,7 +262,12 @@ (m/init-params) (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1})}) (m/forward data-batch)) - (is (= [(first l-shape) num-class]) (-> (m/outputs-merged mod) first (ndarray/shape) (mx-shape/->vec))) + (is (= [(first l-shape) num-class] + (-> mod + (m/outputs-merged) + (first) + (ndarray/shape) + (mx-shape/->vec)))) (-> mod (m/backward) (m/update)) @@ -275,8 +281,13 @@ :index nil :pad 0}] (-> mod - (m/forward data-batch)) - (is (= [(first l-shape) num-class]) (-> (m/outputs-merged mod) first (ndarray/shape) (mx-shape/->vec))) + (m/forward data-batch-2)) + (is (= [(first l-shape) num-class] + (-> mod + (m/outputs-merged) + (first) + (ndarray/shape) + (mx-shape/->vec)))) (-> mod 
(m/backward) (m/update))) @@ -290,8 +301,13 @@ :index nil :pad 0}] (-> mod - (m/forward data-batch)) - (is (= [(first l-shape) num-class]) (-> (m/outputs-merged mod) first (ndarray/shape) (mx-shape/->vec))) + (m/forward data-batch-2)) + (is (= [(first l-shape) num-class] + (-> mod + (m/outputs-merged) + (first) + (ndarray/shape) + (mx-shape/->vec)))) (-> mod (m/backward) (m/update))) @@ -307,7 +323,11 @@ :pad 0}] (-> mod (m/forward data-batch)) - (is (= [(first l-shape) num-class]) (-> (m/outputs-merged mod) first (ndarray/shape) (mx-shape/->vec))) + (is (= [(first l-shape) num-class] + (-> (m/outputs-merged mod) + first + (ndarray/shape) + (mx-shape/->vec)))) (-> mod (m/backward) (m/update))) @@ -321,7 +341,11 @@ :pad 0}] (-> mod (m/forward data-batch)) - (is (= [(first l-shape) num-class]) (-> (m/outputs-merged mod) first (ndarray/shape) (mx-shape/->vec))) + (is (= [(first l-shape) num-class] + (-> (m/outputs-merged mod) + first + (ndarray/shape) + (mx-shape/->vec)))) (-> mod (m/backward) (m/update))))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj new file mode 100644 index 000000000000..18b8b78f19d1 --- /dev/null +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj @@ -0,0 +1,415 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. 
You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns org.apache.clojure-mxnet.ndarray-api-test + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as ctx] + [org.apache.clojure-mxnet.dtype :as dtype] + [org.apache.clojure-mxnet.ndarray :as ndarray :refer [->vec zeros ones += -= *= full shape shape-vec]] + [org.apache.clojure-mxnet.ndarray-api :as ndarray-api] + [org.apache.clojure-mxnet.shape :as mx-shape :refer [->shape]] + [org.apache.clojure-mxnet.test-util :as test-util :refer [approx=]] + [org.apache.clojure-mxnet.util :as util :refer [->option]] + [clojure.test :refer :all])) + +(deftest test-activation + (let [data (ndarray/array [2 1 0 -1 -2] [1 5]) + relu (ndarray-api/activation data "relu") + sigmoid (ndarray-api/activation data "sigmoid") + softsign (ndarray-api/activation data "softsign") + out (ndarray/zeros [1 5]) + _ (ndarray-api/activation {:data data :act-type "relu" :out out})] + (is (= [2.0 1.0 0.0 0.0 0.0] (->vec relu))) + (is (approx= 1e-3 [0.881 0.731 0.5 0.269 0.119] (->vec sigmoid))) + (is (approx= 1e-3 [0.666 0.5 0.0 -0.5 -0.666] (->vec softsign))) + (is (= [2.0 1.0 0.0 0.0 0.0] (->vec out))))) + +(deftest test-bilinear-sampler + (let [data (ndarray/array [1 4 3 6 + 1 8 8 9 + 0 4 1 5 + 1 0 1 3] + [1 1 4 4]) + affine (ndarray/array [2 0 0 + 0 2 0] + [1 6]) + grid (ndarray-api/grid-generator {:data affine :transform-type "affine" :target-shape [4 4]}) + out (ndarray-api/bilinear-sampler data grid)] + (is (approx= 1e-3 + [0.0 0.0 0.0 0.0 + 0.0 3.5 6.5 0.0 + 0.0 1.25 2.5 0.0 + 0.0 0.0 0.0 0.0] + (->vec out))))) + +(deftest test-cast + 
(let [nda1 (ndarray/array [0.9 1.3] [2]) + nda2 (ndarray/array [1e20 11.1] [2]) + nda3 (ndarray/array [300 11.1 10.9 -1 -3] [5]) + out (ndarray/zeros [2] {:dtype dtype/INT32}) + _ (ndarray-api/cast {:data nda1 :dtype (str dtype/INT32) :out out})] + (is (= [0.0 1.0] (->vec (ndarray-api/cast nda1 (str dtype/INT32))))) + (is (= [(float 1e20) (float 11.1)] (->vec (ndarray-api/cast nda2 (str dtype/FLOAT32))))) + ;; uint8 gets converted to native types after ->vec + (is (= [44.0 11.0 10.0 -1.0 -3.0] (->vec (ndarray-api/cast nda3 "uint8")))))) + +(deftest test-concat + (let [nda1 (ndarray/zeros [1 2]) + nda2 (ndarray/ones [1 2]) + out (ndarray/zeros [1 4]) + res1 (ndarray-api/concat [nda1 nda2] 2) ;; num_args=2, dim=1 (default) + res2 (ndarray-api/concat {:data [nda1 nda2] :num-args 2 :dim 0}) ;; num_args=2, dim=0 + res3 (ndarray-api/concat {:data [nda1 nda2 nda1] :num-args 3 :dim 1}) ;; num_args=3, dim=1 + _ (ndarray-api/concat {:data [nda1 nda2] :num-args 2 :dim 1 :out out}) ;; store result in out + ] + (is (= [0.0 0.0 1.0 1.0] (->vec res1))) + (is (= [1 4] (shape-vec res1))) + (is (= [0.0 0.0 1.0 1.0] (->vec res2))) + (is (= [2 2] (shape-vec res2))) + (is (= [0.0 0.0 1.0 1.0 0.0 0.0] (->vec res3))) + (is (= [1 6] (shape-vec res3))) + (is (= [0.0 0.0 1.0 1.0] (->vec out))) + (is (= [1 4] (shape-vec out))))) + +(deftest test-embedding + (let [input-dim 4 + output-dim 5 + w (ndarray/array [0. 1. 2. 3. 4. + 5. 6. 7. 8. 9. + 10. 11. 12. 13. 14. + 15. 16. 17. 18. 19.] + [4 5]) + x (ndarray/array [1. 3. + 0. 2.] + [2 2]) + out (ndarray-api/embedding x w input-dim output-dim)] + (is (= [5. 6. 7. 8. 9. + 15. 16. 17. 18. 19. + 0. 1. 2. 3. 4. + 10. 11. 12. 13. 14.] + (->vec out))) + (is (= [2 2 5] (shape-vec out))))) + +(deftest test-flatten + (let [nda (ndarray/array [1 2 3 + 4 5 6 + 7 8 9 + 1 2 3 + 4 5 6 + 7 8 9] + [2 3 3]) + out (ndarray/zeros [2 9]) + res (ndarray-api/flatten {:data nda}) + _ (ndarray-api/flatten {:data nda :out out})] + (is (= [1. 2. 3. 4. 5. 6. 7. 8. 9. 
+ 1. 2. 3. 4. 5. 6. 7. 8. 9.] (->vec res))) + (is (= [2 9] (shape-vec res))) + (is (= [1. 2. 3. 4. 5. 6. 7. 8. 9. + 1. 2. 3. 4. 5. 6. 7. 8. 9.] (->vec out))) + (is (= [2 9] (shape-vec out))))) + +(deftest test-instance-norm + (let [x (ndarray/array [1.1 2.2 3.3 4.4] [2 1 2]) + gamma (ndarray/array [1.5] [1]) + beta (ndarray/array [0.5] [1]) + res (ndarray-api/instance-norm x gamma beta)] + (is (approx= 1e-4 [-0.9975 1.9975 + -0.9975 1.9975] (->vec res))) + (is (= [2 1 2] (shape-vec res))))) + +(deftest test-l2-normalization + (let [x (ndarray/array [1 2 3 4 2 2 5 6] [2 2 2]) + res1 (ndarray-api/l2-normalization {:data x}) ;; instance-wise + res2 (ndarray-api/l2-normalization {:data x :mode "instance"}) + res3 (ndarray-api/l2-normalization {:data x :mode "channel"}) + res4 (ndarray-api/l2-normalization {:data x :mode "spatial"})] + (is (approx= 1e-4 [0.1825 0.3651 + 0.5477 0.7303 + 0.2407 0.2407 + 0.6019 0.7223] (->vec res1))) + (is (approx= 1e-4 [0.1825 0.3651 + 0.5477 0.7303 + 0.2407 0.2407 + 0.6019 0.7223] (->vec res2))) + (is (approx= 1e-4 [0.3162 0.4472 + 0.9486 0.8944 + 0.3714 0.3162 + 0.9284 0.9486] (->vec res3))) + (is (approx= 1e-4 [0.4472 0.8944 + 0.6 0.8 + 0.7071 0.7071 + 0.6402 0.7682] (->vec res4))))) + +(deftest test-pad + (let [x (ndarray/array [1 2 3 + 4 5 6 + 7 8 9 + 10 11 12 + 11 12 13 + 14 15 16 + 17 18 19 + 20 21 22] + [2 2 2 3]) + res1 (ndarray-api/pad x "edge" [0,0,0,0,1,1,1,1]) + res2 (ndarray-api/pad {:data x :mode "constant" :pad-width [0,0,0,0,1,1,1,1] :constant-value 0})] + (is (= [1. 1. 2. 3. 3. + 1. 1. 2. 3. 3. + 4. 4. 5. 6. 6. + 4. 4. 5. 6. 6. + 7. 7. 8. 9. 9. + 7. 7. 8. 9. 9. + 10. 10. 11. 12. 12. + 10. 10. 11. 12. 12. + 11. 11. 12. 13. 13. + 11. 11. 12. 13. 13. + 14. 14. 15. 16. 16. + 14. 14. 15. 16. 16. + 17. 17. 18. 19. 19. + 17. 17. 18. 19. 19. + 20. 20. 21. 22. 22. + 20. 20. 21. 22. 22.] (->vec res1))) + (is (= [2 2 4 5] (shape-vec res1))) + (is (= [0. 0. 0. 0. 0. + 0. 1. 2. 3. 0. + 0. 4. 5. 6. 0. + 0. 0. 0. 0. 0. + + 0. 0. 0. 0. 
0. + 0. 7. 8. 9. 0. + 0. 10. 11. 12. 0. + 0. 0. 0. 0. 0. + + 0. 0. 0. 0. 0. + 0. 11. 12. 13. 0. + 0. 14. 15. 16. 0. + 0. 0. 0. 0. 0. + + 0. 0. 0. 0. 0. + 0. 17. 18. 19. 0. + 0. 20. 21. 22. 0. + 0. 0. 0. 0. 0.] (->vec res2))) + (is (= [2 2 4 5] (shape-vec res2))))) + +(deftest test-roi-pooling + (let [xi [[[[ 0., 1., 2., 3., 4., 5.], + [ 6., 7., 8., 9., 10., 11.], + [ 12., 13., 14., 15., 16., 17.], + [ 18., 19., 20., 21., 22., 23.], + [ 24., 25., 26., 27., 28., 29.], + [ 30., 31., 32., 33., 34., 35.], + [ 36., 37., 38., 39., 40., 41.], + [ 42., 43., 44., 45., 46., 47.]]]] + x (ndarray/array (-> xi flatten vec) [1 1 8 6]) + y (ndarray/array [0 0 0 4 4] [1 5]) + res1 (ndarray-api/roi-pooling x y [2 2] 1.0) + res2 (ndarray-api/roi-pooling x y [2 2] 0.7)] + (is (= [14. 16. 26. 28.] (->vec res1))) + (is (= [1 1 2 2] (shape-vec res1))) + (is (= [7. 9. 19. 21.] (->vec res2))) + (is (= [1 1 2 2] (shape-vec res2))))) + +(deftest test-reshape + (let [x (ndarray/array (vec (range 4)) [4]) + y (ndarray/array (vec (range 24)) [2 3 4]) + z (ndarray/array (vec (range 120)) [2 3 4 5]) + res1 (ndarray-api/reshape {:data x :shape [2 2]})] + (is (= [0. 1. 2. 3.] 
(->vec res1))) + (is (= [2 2] (shape-vec res1))) + (is (= (map float (range 24)) (->vec (ndarray-api/reshape {:data y :shape [4 0 2]})))) + (is (= [4 3 2] (shape-vec (ndarray-api/reshape {:data y :shape [4 0 2]})))) + (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 0 0]})))) + (is (= [6 1 4] (shape-vec (ndarray-api/reshape {:data y :shape [6 1 -1]})))) + (is (= [3 1 8] (shape-vec (ndarray-api/reshape {:data y :shape [3 -1 8]})))) + (is (= [24] (shape-vec (ndarray-api/reshape {:data y :shape [-1]})))) + (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [-2]})))) + (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 -2]})))) + (is (= [2 3 4 1 1] (shape-vec (ndarray-api/reshape {:data y :shape [-2 1 1]})))) + (is (= [6 4] (shape-vec (ndarray-api/reshape {:data y :shape [-3 4]})))) + (is (= [6 20] (shape-vec (ndarray-api/reshape {:data z :shape [-3 -3]})))) + (is (= [2 12] (shape-vec (ndarray-api/reshape {:data y :shape [0 -3]})))) + (is (= [6 4] (shape-vec (ndarray-api/reshape {:data y :shape [-3 -2]})))) + (is (= [1 2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [-4 1 2 -2]})))) + (is (= [2 1 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 -4 -1 3 -2]})))))) + +(deftest test-sequence-last + (let [xi [[[ 1., 2., 3.], + [ 4., 5., 6.], + [ 7., 8., 9.]], + + [[ 10., 11., 12.], + [ 13., 14., 15.], + [ 16., 17., 18.]], + + [[ 19., 20., 21.], + [ 22., 23., 24.], + [ 25., 26., 27.]]] + x (ndarray/array (-> xi flatten vec) [3 3 3]) + seq-len1 (ndarray/array [1 1 1] [3]) + seq-len2 (ndarray/array [1 2 3] [3]) + ;; This test is failing with an exception + ;; (most likely a scala generation issue) + ;; res1 (ndarray-api/sequence-last x nil) + ] + ;; (is (= [] (->vec res1))) +)) + +(deftest test-sequence-mask + (let [xi [[[ 1., 2., 3.], + [ 4., 5., 6.]], + + [[ 7., 8., 9.], + [ 10., 11., 12.]], + + [[ 13., 14., 15.], + [ 16., 17., 18.]]] + x (ndarray/array (-> xi flatten vec) [3 2 3]) + seq-len1 (ndarray/array [1 1] 
[2]) + seq-len2 (ndarray/array [2 3] [2]) + ;; Same issue as previous test + ;; res1 (ndarray-api/sequence-mask x seq-len1) + ] + ;; (is (= [] (->vec res1))) +)) + +(deftest test-slice-channel + (let [xi [[[ 1.] [ 2.]] + [[ 3.] [ 4.]] + [[ 5.] [ 6.]]] + x (ndarray/array (-> xi flatten vec) [3 2 1]) + res1 (ndarray-api/slice-channel {:data x :num-outputs 2 :axis 1}) + res2 (ndarray-api/slice-channel {:data x :num-outputs 3 :axis 0}) + res3 (ndarray-api/slice-channel {:data x :num-outputs 3 :axis 0 :squeeze-axis 1})] + (is (= [1. 3. 5.] (->vec res1))) + (is (= [3 1 1] (shape-vec res1))) + (is (= [1. 2.] (->vec res2))) + (is (= [1 2 1] (shape-vec res2))) + (is (= [1. 2.] (->vec res3))) + (is (= [2 1] (shape-vec res3))))) + +(deftest test-softmax-activation + (let [x (ndarray/array [1 1 1 1 1 1] [2 3]) + res1 (ndarray-api/softmax-activation {:data x :mode "instance"})] + (is (approx= 1e-3 [0.333 0.333 0.333 + 0.333 0.333 0.333] (->vec res1))) + (is (= [2 3] (shape-vec res1))))) + +(deftest test-softmax-output + (let [datai [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]] + data (ndarray/array (-> datai flatten vec) [4 4]) + label (ndarray/array [1,0,2,3] [4]) + res1 (ndarray-api/softmax-output data label)] + (is (approx= 1e-4 [0.0321 0.0871 0.2369 0.6439 + 0.25 0.25 0.25 0.25 + 0.25 0.25 0.25 0.25 + 0.25 0.25 0.25 0.25] (->vec res1))) + (is (= [4 4] (shape-vec res1))))) + +(deftest test-swap-axis + (let [x (ndarray/array (range 3) [1 3]) + y (ndarray/array (range 8) [2 2 2]) + res1 (ndarray-api/swap-axis {:data x :dim1 0 :dim2 1}) + res2 (ndarray-api/swap-axis {:data y :dim1 0 :dim2 2})] + (is (= [0. 1. 2.] (->vec res1))) + (is (= [3 1] (shape-vec res1))) + (is (= [0. 4. 2. 6. 1. 5. 3. 7.] (->vec res2))) + (is (= [2 2 2] (shape-vec res2))))) + +(deftest test-abs + (let [x (ndarray/array [-2 0 3] [3]) + res1 (ndarray-api/abs {:data x})] + (is (= [2. 0. 3.] 
(->vec res1))) + (is (= [3] (shape-vec res1))))) + +(deftest test-arccos + (let [x (ndarray/array [-1 -0.707 0 0.707 1] [5]) + pi Math/PI + res1 (ndarray-api/arccos {:data x})] + (is (approx= 1e-3 [pi (* 0.75 pi) (* 0.5 pi) (* 0.25 pi) 0.] (->vec res1))))) + +(deftest test-arcsin + (let [x (ndarray/array [-1 -0.707 0 0.707 1] [5]) + pi Math/PI + res1 (ndarray-api/arcsin {:data x})] + (is (approx= 1e-3 [(- (* 0.5 pi)) (- (* 0.25 pi)) 0 (* 0.25 pi) (* 0.5 pi)] (->vec res1))))) + +(deftest test-argmax + (let [x (ndarray/array (range 6) [2 3]) + res1 (ndarray-api/argmax {:data x :axis 0}) + res2 (ndarray-api/argmax {:data x :axis 1}) + res3 (ndarray-api/argmax {:data x :axis 0 :keepdims true}) + res4 (ndarray-api/argmax {:data x :axis 1 :keepdims true})] + (is (= [1. 1. 1.] (->vec res1))) + (is (= [3] (shape-vec res1))) + (is (= [2. 2.] (->vec res2))) + (is (= [2] (shape-vec res2))) + (is (= [1. 1. 1.] (->vec res3))) + (is (= [1 3] (shape-vec res3))) + (is (= [2. 2.] (->vec res4))) + (is (= [2 1] (shape-vec res4))))) + +(deftest test-argmax-channel + (let [x (ndarray/array (range 6) [2 3]) + res1 (ndarray-api/argmax-channel {:data x})] + (is (= [2. 2.] (->vec res1))) + (is (= [2] (shape-vec res1))))) + +(deftest test-argmin + (let [x (ndarray/array (reverse (range 6)) [2 3]) + res1 (ndarray-api/argmin {:data x :axis 0}) + res2 (ndarray-api/argmin {:data x :axis 1}) + res3 (ndarray-api/argmin {:data x :axis 0 :keepdims true}) + res4 (ndarray-api/argmin {:data x :axis 1 :keepdims true})] + (is (= [1. 1. 1.] (->vec res1))) + (is (= [3] (shape-vec res1))) + (is (= [2. 2.] (->vec res2))) + (is (= [2] (shape-vec res2))) + (is (= [1. 1. 1.] (->vec res3))) + (is (= [1 3] (shape-vec res3))) + (is (= [2. 2.] 
(->vec res4))) + (is (= [2 1] (shape-vec res4))))) + +(deftest test-argsort + (let [x (ndarray/array [0.3 0.2 0.4 + 0.1 0.3 0.2] + [2 3]) + y (ndarray/array [0.3 0.2 0.4 0.1 0.3 0.2] [6]) + res1 (ndarray-api/argsort {:data x}) + res2 (ndarray-api/argsort {:data x :axis 0}) + res3 (ndarray-api/argsort {:data y})] + (is (= [1. 0. 2. + 0. 2. 1.] + (->vec res1))) + (is (= [2 3] (shape-vec res1))) + (is (= [1. 0. 1. + 0. 1. 0.] + (->vec res2))) + (is (= [2 3] (shape-vec res1))) + (is (= [3. 1. 5. 0. 4. 2.] (->vec res3))) + (is (= [6] (shape-vec res3))))) + +(deftest test-batch-take + (let [x (ndarray/array (range 6) [3 2]) + i (ndarray/as-type (ndarray/array [0 1 0] [3]) dtype/INT32) + res1 (ndarray-api/batch-take x i) ] + (is (= [0. 3. 4.] (->vec res1))))) + +(deftest test-broadcast-add + (let [x (ndarray/ones [2 3]) + y (ndarray/array (range 2) [2 1]) + res1 (ndarray-api/broadcast-add x y)] + (is (= [1. 1. 1. 2. 2. 2.] (->vec res1))) + (is (= [2 3] (shape-vec res1))))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj index 9ffd3abed2f9..13209e609a1d 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj @@ -28,7 +28,7 @@ (is (= [0.0 0.0 0.0 0.0] (->vec (zeros [2 2]))))) (deftest test-to-array - (is (= [0.0 0.0 0.0 0.0]) (vec (ndarray/to-array (zeros [2 2]))))) + (is (= [0.0 0.0 0.0 0.0] (vec (ndarray/to-array (zeros [2 2])))))) (deftest test-to-scalar (is (= 0.0 (ndarray/to-scalar (zeros [1])))) @@ -61,8 +61,8 @@ (is (= [2.0 2.0] (->vec (ndarray/+ ndones 1)))) (is (= [1.0 1.0] (->vec ndones))) ;;; += mutuates - (is (= [2.0 2.0]) (->vec (+= ndones 1))) - (is (= [2.0 2.0]) (->vec ndones)))) + (is (= [2.0 2.0] (->vec (+= ndones 1)))) + (is (= [2.0 2.0] (->vec ndones))))) (deftest test-minus (let [ndones (ones [2 1]) @@ -71,8 +71,8 @@ (is (= [-1.0 -1.0] (->vec 
(ndarray/- ndzeros 1)))) (is (= [0.0 0.0] (->vec ndzeros))) ;;; += mutuates - (is (= [-1.0 -1.0]) (->vec (-= ndzeros 1))) - (is (= [-1.0 -1.0]) (->vec ndzeros)))) + (is (= [-1.0 -1.0] (->vec (-= ndzeros 1)))) + (is (= [-1.0 -1.0] (->vec ndzeros))))) (deftest test-multiplication (let [ndones (ones [2 1]) @@ -146,6 +146,18 @@ (is (= [0.0 0.0 0.5 0.5 1.0 1.0 1.5 1.5 2.0 2.0 2.5 2.5 3.0 3.0 3.5 3.5 4.0 4.0 4.5 4.5] (->vec (ndarray/arange start stop {:step step :repeat repeat})))))) +(deftest test->ndarray + (let [nda1 (ndarray/->ndarray [5.0 -4.0]) + nda2 (ndarray/->ndarray [[1 2 3] + [4 5 6]]) + nda3 (ndarray/->ndarray [[[7.0] [8.0]]])] + (is (= [5.0 -4.0] (->vec nda1))) + (is (= [2] (mx-shape/->vec (shape nda1)))) + (is (= [1.0 2.0 3.0 4.0 5.0 6.0] (->vec nda2))) + (is (= [2 3] (mx-shape/->vec (shape nda2)))) + (is (= [7.0 8.0] (->vec nda3))) + (is (= [1 2 1] (mx-shape/->vec (shape nda3)))))) + (deftest test-power (let [nda (ndarray/array [3 5] [2 1])] @@ -396,7 +408,7 @@ (let [nda (ndarray/array [1 2 3 4 5 6] [3 2]) res (ndarray/at nda 1)] (is (= [2] (-> res shape mx-shape/->vec))) - (is (= [3 4])))) + (is (= [3 4] (-> res ndarray/->int-vec))))) (deftest test-reshape (let [nda (ndarray/array [1 2 3 4 5 6] [3 2]) @@ -473,3 +485,15 @@ (is (= [2 2] (ndarray/->int-vec nda))) (is (= [2.0 2.0] (ndarray/->double-vec nda))) (is (= [(byte 2) (byte 2)] (ndarray/->byte-vec nda))))) + +(deftest test->nd-vec + (is (= [[[1.0]]] + (ndarray/->nd-vec (ndarray/array [1] [1 1 1])))) + (is (= [[[1.0]] [[2.0]] [[3.0]]] + (ndarray/->nd-vec (ndarray/array [1 2 3] [3 1 1])))) + (is (= [[[1.0 2.0]] [[3.0 4.0]] [[5.0 6.0]]] + (ndarray/->nd-vec (ndarray/array [1 2 3 4 5 6] [3 1 2])))) + (is (= [[[1.0] [2.0]] [[3.0] [4.0]] [[5.0] [6.0]]] + (ndarray/->nd-vec (ndarray/array [1 2 3 4 5 6] [3 2 1])))) + (is (thrown-with-msg? 
Exception #"Invalid input array" + (ndarray/->nd-vec [1 2 3 4 5])))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj index 3b97190854b4..5e1b127d18bd 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj @@ -264,9 +264,9 @@ _ (executor/set-arg exec "datas" data-vec) output (-> (executor/forward exec) (executor/outputs) first)] (is (approx= 1e-5 expected output)) - (is (= [0 0 0 0]) (-> (executor/backward exec (ndarray/ones shape-vec)) + (is (= [0 0 0 0] (-> (executor/backward exec (ndarray/ones shape-vec)) (executor/get-grad "datas") - (ndarray/->vec))))) + (ndarray/->int-vec)))))) (defn check-symbol-operation [operator data-vec-1 data-vec-2 expected] @@ -280,8 +280,8 @@ output (-> (executor/forward exec) (executor/outputs) first)] (is (approx= 1e-5 expected output)) _ (executor/backward exec (ndarray/ones shape-vec)) - (is (= [0 0 0 0]) (-> (executor/get-grad exec "datas") (ndarray/->vec))) - (is (= [0 0 0 0]) (-> (executor/get-grad exec "datas2") (ndarray/->vec))))) + (is (= [0 0 0 0] (-> (executor/get-grad exec "datas") (ndarray/->int-vec)))) + (is (= [0 0 0 0] (-> (executor/get-grad exec "datas2") (ndarray/->int-vec)))))) (defn check-scalar-2-operation [operator data-vec expected] @@ -292,9 +292,9 @@ _ (executor/set-arg exec "datas" data-vec) output (-> (executor/forward exec) (executor/outputs) first)] (is (approx= 1e-5 expected output)) - (is (= [0 0 0 0]) (-> (executor/backward exec (ndarray/ones shape-vec)) + (is (= [0 0 0 0] (-> (executor/backward exec (ndarray/ones shape-vec)) (executor/get-grad "datas") - (ndarray/->vec))))) + (ndarray/->int-vec)))))) (deftest test-scalar-equal (check-scalar-operation sym/equal [1 2 3 4] 2 [0 1 0 0])) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj 
b/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj index 6952335c1390..ca1dcc9430dc 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj @@ -56,6 +56,8 @@ (is (thrown? Exception (fn_ 'a 2 []))) (is (thrown? Exception (fn_ 1 'b []))) (is (thrown? Exception (fn_ 1 2 [-1]))) + (is (thrown? Exception (fn_ 1 0 [1 2]))) + (is (thrown? Exception (fn_ 1 -1 [1 2]))) (is (thrown? Exception (fn_ 1 2 [2 3 0]))) (is (thrown? Exception (fn_ 1 2 [10 10] {:ctx "a"}))) (let [ctx (context/default-context)] @@ -64,4 +66,4 @@ (deftest test-random-parameters-specs (random-or-normal random/normal) (random-or-normal random/uniform) - (is (thrown? Exception (random/seed "a")))) \ No newline at end of file + (is (thrown? Exception (random/seed "a")))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj new file mode 100644 index 000000000000..b642ad75d1d0 --- /dev/null +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj @@ -0,0 +1,61 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + +(ns org.apache.clojure-mxnet.symbol-api-test + (:require [org.apache.clojure-mxnet.dtype :as dtype] + [org.apache.clojure-mxnet.executor :as executor] + [org.apache.clojure-mxnet.ndarray :as ndarray] + [org.apache.clojure-mxnet.symbol :as sym] + [org.apache.clojure-mxnet.symbol-api :as sym-api] + [org.apache.clojure-mxnet.util :as util] + [clojure.test :refer :all] + [org.apache.clojure-mxnet.context :as context])) + +(deftest test-compose + (let [data (sym/variable "data") + net1 (sym-api/fully-connected {:data data :num-hidden 10 :name "fc1"}) + net1 (sym-api/fully-connected {:data net1 :num-hidden 100 :name "fc2"} ) + + net2 (sym-api/fully-connected {:num-hidden 10 :name "fc3"}) + net2 (sym-api/activation {:data net2 :act-type "relu"}) + net2 (sym-api/fully-connected {:data net2 :num-hidden 20 :name "fc4"}) + + composed (sym/apply net2 "composed" {"fc3_data" net1}) + + multi-out (sym/group [composed net1])] + + (is (= ["data" "fc1_weight" "fc1_bias" "fc2_weight" "fc2_bias"] (sym/list-arguments net1))) + (is (= 2 (count (sym/list-outputs multi-out)))))) + +(deftest test-symbol-internal + (let [data (sym/variable "data") + oldfc (sym-api/fully-connected {:data data :num-hidden 10 :name"fc1"}) + net1 (sym-api/fully-connected {:data oldfc :num-hidden 100 :name"fc2"})] + (is (= ["data" "fc1_weight" "fc1_bias" "fc2_weight" "fc2_bias"] (sym/list-arguments net1))) + (= (sym/list-arguments oldfc) (-> (sym/get-internals net1) + (sym/get "fc1_output") + (sym/list-arguments))))) + +(deftest test-infer-type + (let [data (sym/variable "data") + f32data (sym-api/cast {:data data :dtype "float32"}) + fc1 (sym-api/fully-connected {:data f32data :num-hidden 128 :name"fc1"}) + mlp (sym-api/softmax-output {:data fc1 :name"softmax"}) + [arg out aux] (sym/infer-type mlp {:data dtype/FLOAT64})] + (is (= [dtype/FLOAT64 dtype/FLOAT32 dtype/FLOAT32 dtype/FLOAT32] (util/buffer->vec arg))) + (is (= [dtype/FLOAT32] (util/buffer->vec out))) + (is (= [] (util/buffer->vec aux))))) diff 
--git a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj index 89b51237d3a5..4d1b493ab2b6 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj @@ -57,7 +57,7 @@ mlp (sym/softmax-output "softmax" {:data fc1}) [arg out aux] (sym/infer-type mlp {:data dtype/FLOAT64})] (is (= [dtype/FLOAT64 dtype/FLOAT32 dtype/FLOAT32 dtype/FLOAT32] (util/buffer->vec arg))) - (is (= [dtype/FLOAT32 (util/buffer->vec out)])) + (is (= [dtype/FLOAT32] (util/buffer->vec out))) (is (= [] (util/buffer->vec aux))))) (deftest test-copy @@ -70,10 +70,10 @@ b (sym/variable "b") c (sym/+ a b) ex (sym/bind c {"a" (ndarray/ones [2 2]) "b" (ndarray/ones [2 2])})] - (is (= [2.0 2.0 2.0 2.0]) (-> (executor/forward ex) - (executor/outputs) - (first) - (ndarray/->vec))))) + (is (= [2.0 2.0 2.0 2.0] (-> (executor/forward ex) + (executor/outputs) + (first) + (ndarray/->vec)))))) (deftest test-simple-bind (let [a (sym/ones [3]) b (sym/ones [3]) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj index 4ed7d38e690a..6652b68a4830 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj @@ -70,8 +70,8 @@ (util/option->value))))) (deftest test-keyword->snake-case - (is (= [:foo-bar :foo2 :bar-bar]) - (util/keyword->snake-case [:foo_bar :foo2 :bar-bar]))) + (is (= ["foo_bar" "foo2" "bar_bar"] + (mapv util/keyword->snake-case [:foo_bar :foo2 :bar-bar])))) (deftest test-convert-tuple (is (instance? 
Tuple1 (util/convert-tuple [1]))) @@ -163,6 +163,33 @@ (is (= [1 2] (-> (util/convert-tuple [1 2]) (util/tuple->vec))))) +(deftest test-to-array-nd + (let [a1 (util/to-array-nd '(1)) + a2 (util/to-array-nd [1.0 2.0]) + a3 (util/to-array-nd [[3.0] [4.0]]) + a4 (util/to-array-nd [[[5 -5]]])] + (is (= 1 (alength a1))) + (is (= [1] (->> a1 vec))) + (is (= 2 (alength a2))) + (is (= 2.0 (aget a2 1))) + (is (= [1.0 2.0] (->> a2 vec))) + (is (= 2 (alength a3))) + (is (= 1 (alength (aget a3 0)))) + (is (= 4.0 (aget a3 1 0))) + (is (= [[3.0] [4.0]] (->> a3 vec (mapv vec)))) + (is (= 1 (alength a4))) + (is (= 1 (alength (aget a4 0)))) + (is (= 2 (alength (aget a4 0 0)))) + (is (= 5 (aget a4 0 0 0))) + (is (= [[[5 -5]]] (->> a4 vec (mapv vec) (mapv #(mapv vec %))))))) + +(deftest test-nd-seq-shape + (is (= [1] (util/nd-seq-shape '(5)))) + (is (= [2] (util/nd-seq-shape [1.0 2.0]))) + (is (= [3] (util/nd-seq-shape [1 1 1]))) + (is (= [2 1] (util/nd-seq-shape [[3.0] [4.0]]))) + (is (= [1 3 2] (util/nd-seq-shape [[[5 -5] [5 -5] [5 -5]]])))) + (deftest test-coerce-return (is (= [] (util/coerce-return (ArrayBuffer.)))) (is (= [1 2 3] (util/coerce-return (util/vec->indexed-seq [1 2 3])))) diff --git a/cpp-package/CMakeLists.txt b/cpp-package/CMakeLists.txt index 5d2977279d74..fec86e78e6b8 100644 --- a/cpp-package/CMakeLists.txt +++ b/cpp-package/CMakeLists.txt @@ -16,7 +16,7 @@ if(USE_CPP_PACKAGE) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts ) - if(NOT DO_NOT_BUILD_EXAMPLES) + if(BUILD_CPP_EXAMPLES) add_subdirectory(example) endif() diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp index e2083a0dfa0a..21029da4678b 100644 --- a/cpp-package/example/alexnet.cpp +++ b/cpp-package/example/alexnet.cpp @@ -196,19 +196,40 @@ Symbol AlexnetSymbol(int num_classes) { return softmax; } +NDArray ResizeInput(NDArray data, const Shape new_shape) { + NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); + NDArray pic_1channel; + Operator("_contrib_BilinearResize2D") 
+ .SetParam("height", new_shape[2]) + .SetParam("width", new_shape[3]) + (pic).Invoke(pic_1channel); + NDArray output; + Operator("tile") + .SetParam("reps", Shape(1, 3, 1, 1)) + (pic_1channel).Invoke(output); + return output; +} + int main(int argc, char const *argv[]) { /*basic config*/ - int batch_size = 256; int max_epo = argc > 1 ? strtol(argv[1], NULL, 10) : 100; float learning_rate = 1e-4; float weight_decay = 1e-4; - /*context and net symbol*/ - auto ctx = Context::gpu(); -#if MXNET_USE_CPU - ctx = Context::cpu(); + /*context*/ + auto ctx = Context::cpu(); + int num_gpu; + MXGetGPUCount(&num_gpu); + int batch_size = 32; +#if !MXNET_USE_CPU + if (num_gpu > 0) { + ctx = Context::gpu(); + batch_size = 256; + } #endif + TRY + /*net symbol*/ auto Net = AlexnetSymbol(10); /*args_map and aux_map is used for parameters' saving*/ @@ -216,8 +237,10 @@ int main(int argc, char const *argv[]) { std::map aux_map; /*we should tell mxnet the shape of data and label*/ - args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx); - args_map["label"] = NDArray(Shape(batch_size), ctx); + const Shape data_shape = Shape(batch_size, 3, 256, 256), + label_shape = Shape(batch_size); + args_map["data"] = NDArray(data_shape, ctx); + args_map["label"] = NDArray(label_shape, ctx); /*with data and label, executor can be generated automatically*/ auto *exec = Net.SimpleBind(ctx, args_map); @@ -261,17 +284,18 @@ int main(int argc, char const *argv[]) { ->SetParam("wd", weight_decay); Accuracy acu_train, acu_val; - LogLoss logloss_val; - for (int iter = 0; iter < max_epo; ++iter) { - LG << "Train Epoch: " << iter; + LogLoss logloss_train, logloss_val; + for (int epoch = 0; epoch < max_epo; ++epoch) { + LG << "Train Epoch: " << epoch; /*reset the metric every epoch*/ acu_train.Reset(); /*reset the data iter every epoch*/ train_iter.Reset(); + int iter = 0; while (train_iter.Next()) { auto batch = train_iter.GetDataBatch(); /*use copyto to feed new data and label to the executor*/ - 
batch.data.CopyTo(&args_map["data"]); + ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]); batch.label.CopyTo(&args_map["label"]); exec->Forward(true); exec->Backward(); @@ -282,44 +306,53 @@ int main(int argc, char const *argv[]) { NDArray::WaitAll(); acu_train.Update(batch.label, exec->outputs[0]); + logloss_train.Reset(); + logloss_train.Update(batch.label, exec->outputs[0]); + ++iter; + LG << "EPOCH: " << epoch << " ITER: " << iter + << " Train Accuracy: " << acu_train.Get() + << " Train Loss: " << logloss_train.Get(); } - LG << "ITER: " << iter << " Train Accuracy: " << acu_train.Get(); + LG << "EPOCH: " << epoch << " Train Accuracy: " << acu_train.Get(); - LG << "Val Epoch: " << iter; + LG << "Val Epoch: " << epoch; acu_val.Reset(); val_iter.Reset(); logloss_val.Reset(); + iter = 0; while (val_iter.Next()) { auto batch = val_iter.GetDataBatch(); - LG << val_iter.GetDataBatch().index.size(); - batch.data.CopyTo(&args_map["data"]); + ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]); batch.label.CopyTo(&args_map["label"]); exec->Forward(false); NDArray::WaitAll(); acu_val.Update(batch.label, exec->outputs[0]); logloss_val.Update(batch.label, exec->outputs[0]); + LG << "EPOCH: " << epoch << " ITER: " << iter << " Val Accuracy: " << acu_val.Get(); + ++iter; } - LG << "ITER: " << iter << " Val Accuracy: " << acu_val.Get(); - LG << "ITER: " << iter << " Val LogLoss: " << logloss_val.Get(); + LG << "EPOCH: " << epoch << " Val Accuracy: " << acu_val.Get(); + LG << "EPOCH: " << epoch << " Val LogLoss: " << logloss_val.Get(); /*save the parameters*/ std::stringstream ss; - ss << iter; - std::string iter_str; - ss >> iter_str; - std::string save_path_param = "alex_param_" + iter_str; + ss << epoch; + std::string epoch_str; + ss >> epoch_str; + std::string save_path_param = "alex_param_" + epoch_str; auto save_args = args_map; /*we do not want to save the data and label*/ save_args.erase(save_args.find("data")); 
save_args.erase(save_args.find("label")); /*the alexnet does not get any aux array, so we do not need to save * aux_map*/ - LG << "ITER: " << iter << " Saving to..." << save_path_param; + LG << "EPOCH: " << epoch << " Saving to..." << save_path_param; NDArray::Save(save_path_param, save_args); } /*don't foget to release the executor*/ delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index 8951580067a8..ac5faa47b58c 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -42,6 +42,7 @@ #include #include #include "mxnet-cpp/MxNetCpp.h" +#include "utils.h" using namespace mxnet::cpp; @@ -721,6 +722,7 @@ int main(int argc, char** argv) { TIME_MAJOR = task.find("TimeMajor") != std::string::npos; std::cout << "use BuiltIn cuDNN RNN: " << builtIn << std::endl << "use data as TimeMajor: " << TIME_MAJOR << std::endl; + TRY if (task.find("train") == 0) { std::cout << "train batch size: " << argv[3] << std::endl << "train max epoch: " << argv[4] << std::endl; @@ -746,5 +748,6 @@ int main(int argc, char** argv) { } MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/feature_extract/run.sh b/cpp-package/example/feature_extract/run.sh index 616445dbd671..b98ddb9eb81e 100755 --- a/cpp-package/example/feature_extract/run.sh +++ b/cpp-package/example/feature_extract/run.sh @@ -17,7 +17,12 @@ # Downloading the data and model mkdir -p model -wget -nc http://data.dmlc.ml/mxnet/models/imagenet/inception-bn.tar.gz +wget -nc -O model/Inception-BN-symbol.json \ + http://data.mxnet.io/mxnet/models/imagenet/inception-bn/Inception-BN-symbol.json +wget -nc -O model/synset.txt \ + http://data.mxnet.io/mxnet/models/imagenet/synset.txt +wget -nc -O model/Inception-BN-0126.params \ + http://data.mxnet.io/mxnet/models/imagenet/inception-bn/Inception-BN-0126.params?raw=true wget -nc -O cat.jpg 
https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true wget -nc -O dog.jpg https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true wget -nc -O model/mean_224.nd https://github.com/dmlc/web-data/raw/master/mxnet/example/feature_extract/mean_224.nd diff --git a/cpp-package/example/get_data.sh b/cpp-package/example/get_data.sh index 7f975222d0be..b0913bdb684d 100755 --- a/cpp-package/example/get_data.sh +++ b/cpp-package/example/get_data.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -14,29 +16,48 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -unameOut="$(uname -s)" -case "${unameOut}" in - Linux*) CMD='wget';; - Darwin*) CMD='curl -o';; - CYGWIN*) CMD='wget';; - MINGW*) CMD='wget';; - *) CMD="" -esac -if [ ! -d "./data" ]; then - mkdir data -fi +set -e + +mkdir -p data/mnist_data +cd data/mnist_data + +download () { + local URL=$1 + local GZ_FILE_NAME="${URL##*/}" + + local FILE_NAME="${GZ_FILE_NAME%.*}" + if [[ -f "${FILE_NAME}" ]]; then + echo "File ${FILE_NAME} already downloaded." + return 0 + fi -if [ ! -d "./data/mnist_data" ]; then - mkdir ./data/mnist_data + echo "Downloading ${URL} ..." 
+ local CURL_OPTIONS="--connect-timeout 10 \ + --max-time 300 \ + --retry-delay 10 \ + --retry 3 \ + --retry-delay 0 \ + --location \ + --silent" + curl ${CURL_OPTIONS} ${URL} -o ${GZ_FILE_NAME} - (cd data/mnist_data; $CMD train-images-idx3-ubyte.gz https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz) - (cd data/mnist_data; $CMD train-labels-idx1-ubyte.gz https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz) - (cd data/mnist_data; $CMD t10k-images-idx3-ubyte.gz https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz) - (cd data/mnist_data; $CMD t10k-labels-idx1-ubyte.gz https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz) - (cd data/mnist_data; $CMD mnist_train.csv.gz http://data.mxnet.io/data/mnist_train.csv.gz) - (cd data/mnist_data; gzip -d *.gz) -fi + if [[ ! -f "${GZ_FILE_NAME}" ]]; then + echo "File ${URL} couldn't be downloaded!" + exit 1 + fi + gzip -d ${GZ_FILE_NAME} + (($? 
!= 0)) && exit 1 || return 0 +} +FILES=( + "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz" + "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz" + "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz" + "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz" + "http://data.mxnet.io/data/mnist_train.csv.gz") +for FILE in ${FILES[@]}; do + download ${FILE} +done diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp index 26ba51027db6..9cf1834cf22c 100644 --- a/cpp-package/example/googlenet.cpp +++ b/cpp-package/example/googlenet.cpp @@ -124,6 +124,7 @@ int main(int argc, char const *argv[]) { ctx = Context::cpu();; #endif + TRY auto googlenet = GoogleNetSymbol(10); std::map args_map; std::map aux_map; @@ -192,5 +193,6 @@ int main(int argc, char const *argv[]) { delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index 2073ebe47fbc..caf858a64177 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -142,23 +142,45 @@ Symbol InceptionSymbol(int num_classes) { return SoftmaxOutput("softmax", fc1, data_label); } +NDArray ResizeInput(NDArray data, const Shape new_shape) { + NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); + NDArray pic_1channel; + Operator("_contrib_BilinearResize2D") + .SetParam("height", new_shape[2]) + .SetParam("width", new_shape[3]) + (pic).Invoke(pic_1channel); + NDArray output; + Operator("tile") + .SetParam("reps", Shape(1, 3, 1, 1)) + (pic_1channel).Invoke(output); + return output; +} + int main(int argc, char const *argv[]) { int batch_size = 40; int max_epoch = argc > 1 ? 
strtol(argv[1], NULL, 10) : 100; float learning_rate = 1e-2; float weight_decay = 1e-4; - auto ctx = Context::gpu(); -#if MXNET_USE_CPU - ctx = Context::cpu(); + /*context*/ + auto ctx = Context::cpu(); + int num_gpu; + MXGetGPUCount(&num_gpu); +#if !MXNET_USE_CPU + if (num_gpu > 0) { + ctx = Context::gpu(); + } #endif + TRY auto inception_bn_net = InceptionSymbol(10); std::map args_map; std::map aux_map; - args_map["data"] = NDArray(Shape(batch_size, 3, 224, 224), ctx); - args_map["data_label"] = NDArray(Shape(batch_size), ctx); + const Shape data_shape = Shape(batch_size, 3, 224, 224), + label_shape = Shape(batch_size); + args_map["data"] = NDArray(data_shape, ctx); + args_map["data_label"] = NDArray(label_shape, ctx); inception_bn_net.InferArgsMap(ctx, &args_map, args_map); std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", @@ -201,7 +223,7 @@ int main(int argc, char const *argv[]) { train_acc.Reset(); while (train_iter.Next()) { auto data_batch = train_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); @@ -221,7 +243,7 @@ int main(int argc, char const *argv[]) { val_acc.Reset(); while (val_iter.Next()) { auto data_batch = val_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); exec->Forward(false); @@ -234,5 +256,6 @@ int main(int argc, char const *argv[]) { delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp index 78487e6ee0cd..fa5600190f95 100644 --- a/cpp-package/example/inference/inception_inference.cpp +++ b/cpp-package/example/inference/inception_inference.cpp @@ -301,7 +301,7 @@ void 
Predictor::PredictImage(const std::string& image_file) { executor->Forward(false); // The output is available in executor->outputs. - auto array = executor->outputs[0].Copy(global_ctx); + auto array = executor->outputs[0].Copy(Context::cpu()); /* * Find out the maximum accuracy and the index associated with that accuracy. diff --git a/cpp-package/example/inference/unit_test_inception_inference.sh b/cpp-package/example/inference/unit_test_inception_inference.sh index f33b8f19be6d..c3c4630f6e4a 100755 --- a/cpp-package/example/inference/unit_test_inception_inference.sh +++ b/cpp-package/example/inference/unit_test_inception_inference.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,7 +19,7 @@ # Downloading the data and model mkdir -p model -wget -nc http://data.dmlc.ml/mxnet/models/imagenet/inception-bn.tar.gz +wget -nc http://data.mxnet.io/models/imagenet/inception-bn.tar.gz wget -nc -O model/dog.jpg https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true wget -nc -O model/mean_224.nd https://github.com/dmlc/web-data/raw/master/mxnet/example/feature_extract/mean_224.nd tar -xvzf inception-bn.tar.gz -C model diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index 42594548130a..a52efd8fed40 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -25,6 +25,7 @@ #include #include #include "mxnet-cpp/MxNetCpp.h" +#include "utils.h" using namespace mxnet::cpp; @@ -257,8 +258,10 @@ class Lenet { }; int main(int argc, char const *argv[]) { + TRY Lenet lenet; lenet.Run(argc > 1 ? 
strtol(argv[1], NULL, 10) : 100000); MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp index 33110fee3a88..69067d549380 100644 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -66,6 +66,16 @@ Symbol LenetSymbol() { return lenet; } +NDArray ResizeInput(NDArray data, const Shape new_shape) { + NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); + NDArray output; + Operator("_contrib_BilinearResize2D") + .SetParam("height", new_shape[2]) + .SetParam("width", new_shape[3]) + (pic).Invoke(output); + return output; +} + int main(int argc, char const *argv[]) { /*setup basic configs*/ int W = 28; @@ -74,15 +84,24 @@ int main(int argc, char const *argv[]) { int max_epoch = argc > 1 ? strtol(argv[1], NULL, 10) : 100; float learning_rate = 1e-4; float weight_decay = 1e-4; - auto dev_ctx = Context::gpu(); -#if MXNET_USE_CPU - dev_ctx = Context::cpu(); + + auto dev_ctx = Context::cpu(); + int num_gpu; + MXGetGPUCount(&num_gpu); +#if !MXNET_USE_CPU + if (num_gpu > 0) { + dev_ctx = Context::gpu(); + } #endif + + TRY auto lenet = LenetSymbol(); std::map args_map; - args_map["data"] = NDArray(Shape(batch_size, 1, W, H), dev_ctx); - args_map["data_label"] = NDArray(Shape(batch_size), dev_ctx); + const Shape data_shape = Shape(batch_size, 1, H, W), + label_shape = Shape(batch_size); + args_map["data"] = NDArray(data_shape, dev_ctx); + args_map["data_label"] = NDArray(label_shape, dev_ctx); lenet.InferArgsMap(dev_ctx, &args_map, args_map); args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), dev_ctx); @@ -131,7 +150,7 @@ int main(int argc, char const *argv[]) { samples += batch_size; auto data_batch = train_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); @@ -163,7 +182,7 @@ int 
main(int argc, char const *argv[]) { val_iter.Reset(); while (val_iter.Next()) { auto data_batch = val_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); @@ -179,5 +198,6 @@ int main(int argc, char const *argv[]) { delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp index c3760fd3c846..ee09bf8da3f8 100644 --- a/cpp-package/example/mlp.cpp +++ b/cpp-package/example/mlp.cpp @@ -24,6 +24,7 @@ #include #include #include "mxnet-cpp/MxNetCpp.h" +#include "utils.h" using namespace mxnet::cpp; @@ -173,7 +174,9 @@ void MLP(int max_epoch) { int main(int argc, char** argv) { int max_epoch = argc > 1 ? strtol(argv[1], NULL, 10) : 15000; + TRY MLP(max_epoch); MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp index 5d46d40e421f..7ea6946dd8c2 100644 --- a/cpp-package/example/mlp_cpu.cpp +++ b/cpp-package/example/mlp_cpu.cpp @@ -72,6 +72,7 @@ int main(int argc, char** argv) { return 1; } + TRY auto net = mlp(layers); Context ctx = Context::cpu(); // Use CPU for training @@ -141,5 +142,6 @@ int main(int argc, char** argv) { delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp index f12b7c17133d..0d9103783a79 100644 --- a/cpp-package/example/mlp_csv.cpp +++ b/cpp-package/example/mlp_csv.cpp @@ -156,6 +156,7 @@ int main(int argc, char** argv) { .SetParam("shuffle", 0) .CreateDataIter(); + TRY auto net = mlp(hidden_units); Context ctx = Context::cpu(); @@ -269,5 +270,6 @@ int main(int argc, char** argv) { delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp index f6060209a51e..5265de79d976 100644 --- 
a/cpp-package/example/mlp_gpu.cpp +++ b/cpp-package/example/mlp_gpu.cpp @@ -72,6 +72,7 @@ int main(int argc, char** argv) { return 1; } + TRY auto net = mlp(layers); Context ctx = Context::gpu(); // Use GPU for training @@ -157,5 +158,6 @@ int main(int argc, char** argv) { delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index 7200bd42d2de..8f8fd12e32ce 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -153,23 +153,46 @@ Symbol ResNetSymbol(int num_class, int num_level = 3, int num_block = 9, return SoftmaxOutput("softmax", fc, data_label); } +NDArray ResizeInput(NDArray data, const Shape new_shape) { + NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); + NDArray pic_1channel; + Operator("_contrib_BilinearResize2D") + .SetParam("height", new_shape[2]) + .SetParam("width", new_shape[3]) + (pic).Invoke(pic_1channel); + NDArray output; + Operator("tile") + .SetParam("reps", Shape(1, 3, 1, 1)) + (pic_1channel).Invoke(output); + return output; +} + int main(int argc, char const *argv[]) { - int batch_size = 50; int max_epoch = argc > 1 ? 
strtol(argv[1], NULL, 10) : 100; float learning_rate = 1e-4; float weight_decay = 1e-4; + TRY auto resnet = ResNetSymbol(10); std::map args_map; std::map aux_map; - auto ctx = Context::gpu(); -#if MXNET_USE_CPU - ctx = Context::cpu();; + /*context*/ + auto ctx = Context::cpu(); + int num_gpu; + MXGetGPUCount(&num_gpu); + int batch_size = 8; +#if !MXNET_USE_CPU + if (num_gpu > 0) { + ctx = Context::gpu(); + batch_size = 32; + } #endif - args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx); - args_map["data_label"] = NDArray(Shape(batch_size), ctx); + const Shape data_shape = Shape(batch_size, 3, 224, 224), + label_shape = Shape(batch_size); + args_map["data"] = NDArray(data_shape, ctx); + args_map["data_label"] = NDArray(label_shape, ctx); resnet.InferArgsMap(ctx, &args_map, args_map); std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", @@ -206,13 +229,15 @@ int main(int argc, char const *argv[]) { // Create metrics Accuracy train_acc, val_acc; - for (int iter = 0; iter < max_epoch; ++iter) { - LG << "Epoch: " << iter; + LogLoss logloss_train, logloss_val; + for (int epoch = 0; epoch < max_epoch; ++epoch) { + LG << "Epoch: " << epoch; train_iter.Reset(); train_acc.Reset(); + int iter = 0; while (train_iter.Next()) { auto data_batch = train_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); @@ -225,24 +250,34 @@ int main(int argc, char const *argv[]) { } NDArray::WaitAll(); train_acc.Update(data_batch.label, exec->outputs[0]); + logloss_train.Reset(); + logloss_train.Update(data_batch.label, exec->outputs[0]); + ++iter; + LG << "EPOCH: " << epoch << " ITER: " << iter + << " Train Accuracy: " << train_acc.Get() + << " Train Loss: " << logloss_train.Get(); } + LG << "EPOCH: " << epoch << " Train Accuracy: " << train_acc.Get(); val_iter.Reset(); val_acc.Reset(); + iter = 0; while 
(val_iter.Next()) { auto data_batch = val_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); exec->Forward(false); NDArray::WaitAll(); val_acc.Update(data_batch.label, exec->outputs[0]); + LG << "EPOCH: " << epoch << " ITER: " << iter << " Val Accuracy: " << val_acc.Get(); + ++iter; } - LG << "Train Accuracy: " << train_acc.Get(); LG << "Validation Accuracy: " << val_acc.Get(); } delete exec; delete opt; MXNotifyShutdown(); + CATCH return 0; } diff --git a/cpp-package/example/test_kvstore.cpp b/cpp-package/example/test_kvstore.cpp new file mode 100644 index 000000000000..d9e0400a5ac8 --- /dev/null +++ b/cpp-package/example/test_kvstore.cpp @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include "mxnet/c_api.h" // MXGetGPUCount() +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +static bool test_single_key(const Context &context, const std::string &context_str) { + std::string key = "singlekeytest-" + context_str; + + NDArray result(Shape(4), context); + NDArray result_cpu; + + // initialize data + NDArray data_cpu({0.f, 233.f, -0.12f, 9.f}, Shape(4), Context::cpu()); + NDArray data = data_cpu.Copy(context); + NDArray::WaitAll(); + + KVStore::Init(key, data); + NDArray::WaitAll(); + + // retrieve result + KVStore::Pull(key, &result); + NDArray::WaitAll(); + + result_cpu = result.Copy(Context::cpu()); + NDArray::WaitAll(); + + // compare + for (size_t j=0; j < result_cpu.Size(); j++) { + if (result_cpu.GetData()[j] != data_cpu.GetData()[j]) { + LG << "Error: wrong initialized data in singlekeytest-" << context_str + << ", expect " << data_cpu.GetData()[j] + << " got " << result_cpu.GetData()[j]; + return false; + } + } + + // push gradient + NDArray grad_cpu({0.1f, -2.f, -4.4f, 0.f}, Shape(4), Context::cpu()); + NDArray grad = grad_cpu.Copy(context); + NDArray::WaitAll(); + + KVStore::Push(key, grad); + NDArray::WaitAll(); + + // retrieve result + KVStore::Pull(key, &result); + NDArray::WaitAll(); + + result_cpu = result.Copy(Context::cpu()); + NDArray::WaitAll(); + + // compare + for (size_t j=0; j < result_cpu.Size(); j++) { + if (result_cpu.GetData()[j] != grad_cpu.GetData()[j]) { + LG << "Error: wrong gradient data in singlekeytest-" << context_str + << ", expect " << grad_cpu.GetData()[j] + << " got " << result_cpu.GetData()[j]; + return false; + } + } + + return true; +} + +static bool test_multiple_key(const Context &context, const std::string &context_str) { + std::vector keys(2); + keys[0] = "multikeytest-0-" + context_str; + keys[1] = "multikeytest-1-" + context_str; + + std::vector results(2); + results[0] = NDArray(Shape(4), context); + results[1] = NDArray(Shape(4), context); + std::vector results_cpu(2); + + // 
initialize data + std::vector data_cpu(2); + data_cpu[0] = NDArray({0.f, 2.f, -3.12f, 4.f}, Shape(4), Context::cpu()); + data_cpu[1] = NDArray({0.8f, -2.f, 6.6f, 77.f}, Shape(4), Context::cpu()); + std::vector data(2); + data[0] = data_cpu[0].Copy(context); + data[1] = data_cpu[1].Copy(context); + NDArray::WaitAll(); + + KVStore::Init(keys, data); + NDArray::WaitAll(); + + // retrieve result + KVStore::Pull(keys, &results); + NDArray::WaitAll(); + + results_cpu[0] = results[0].Copy(Context::cpu()); + results_cpu[1] = results[1].Copy(Context::cpu()); + NDArray::WaitAll(); + + // compare + for (size_t i=0; i < results_cpu.size(); i++) { + for (size_t j=0; j < results_cpu[i].Size(); j++) { + if (results_cpu[i].GetData()[j] != data_cpu[i].GetData()[j]) { + LG << "Error: wrong initialized data in multikeytest-" << context_str + << ", expect " << data_cpu[i].GetData()[j] + << " got " << results_cpu[i].GetData()[j]; + return false; + } + } + } + + // push gradient, reduce for the second + std::vector push_keys(3); + push_keys[0] = "multikeytest-0-" + context_str; + push_keys[1] = "multikeytest-1-" + context_str; + push_keys[2] = "multikeytest-1-" + context_str; + + std::vector grads_cpu(3); + grads_cpu[0] = NDArray({0.2f, -0.3f, -1.1f, 0.0f}, Shape(4), Context::cpu()); + grads_cpu[1] = NDArray({2.f, 4.f, -4.f, -5.f}, Shape(4), Context::cpu()); + grads_cpu[2] = NDArray({-3.f, -0.2f, 12.f, -9.f}, Shape(4), Context::cpu()); + std::vector grads(3); + grads[0] = grads_cpu[0].Copy(context); + grads[1] = grads_cpu[1].Copy(context); + grads[2] = grads_cpu[2].Copy(context); + NDArray::WaitAll(); + + KVStore::Push(push_keys, grads); + NDArray::WaitAll(); + + // retrieve result + KVStore::Pull(keys, &results); + NDArray::WaitAll(); + + results_cpu[0] = results[0].Copy(Context::cpu()); + results_cpu[1] = results[1].Copy(Context::cpu()); + NDArray::WaitAll(); + + // compare the first + for (size_t j=0; j < results_cpu[0].Size(); j++) { + if (results_cpu[0].GetData()[j] != 
grads_cpu[0].GetData()[j]) { + LG << "Error: wrong gradient data in multikeytest-" << context_str + << ", expect " << grads_cpu[0].GetData()[j] + << " got " << results_cpu[0].GetData()[j]; + return false; + } + } + + // compare the second + for (size_t j=0; j < results_cpu[1].Size(); j++) { + if (results_cpu[1].GetData()[j] != (grads_cpu[1].GetData()[j] + grads_cpu[2].GetData()[j])) { + LG << "Error: wrong reduced gradient data in multikeytest-" << context_str + << ", expect " << (grads_cpu[1].GetData()[j] + grads_cpu[2].GetData()[j]) + << " got " << results_cpu[1].GetData()[j]; + return false; + } + } + + return true; +} + +int main(int argc, char** argv) { + KVStore::SetType("local"); + + bool success1 = test_single_key(Context::cpu(), "cpu"); + bool success2 = test_multiple_key(Context::cpu(), "cpu"); + + bool success3 = true; + bool success4 = true; + + int gpu_count = 0; + if (MXGetGPUCount(&gpu_count) != 0) { + LG << "Error: MXGetGPUCount"; + + MXNotifyShutdown(); + return 1; + } + + if (gpu_count > 0) { + success3 = test_single_key(Context::gpu(), "gpu"); + success4 = test_multiple_key(Context::gpu(), "gpu"); + } + + int ret = (success1 && success2 && success3 && success4) ? 
0 : 1; + + MXNotifyShutdown(); + return ret; +} diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp index 687683f487f8..0ccdf65b3b19 100644 --- a/cpp-package/example/test_score.cpp +++ b/cpp-package/example/test_score.cpp @@ -62,6 +62,7 @@ int main(int argc, char** argv) { const int max_epoch = 10; const float learning_rate = 0.1; const float weight_decay = 1e-2; + float score = 0; std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", "./data/mnist_data/train-labels-idx1-ubyte", @@ -79,6 +80,7 @@ int main(int argc, char** argv) { return 1; } + TRY auto net = mlp(layers); Context ctx = Context::gpu(); // Use GPU for training @@ -111,7 +113,6 @@ int main(int argc, char** argv) { auto *exec = net.SimpleBind(ctx, args); auto arg_names = net.ListArguments(); - float score = 0; // Start training for (int iter = 0; iter < max_epoch; ++iter) { int samples = 0; @@ -158,5 +159,6 @@ int main(int argc, char** argv) { delete exec; delete opt; MXNotifyShutdown(); + CATCH return score >= MIN_SCORE ? 
0 : 1; } diff --git a/cpp-package/example/utils.h b/cpp-package/example/utils.h index 020d1ec5804e..87847701ce6e 100644 --- a/cpp-package/example/utils.h +++ b/cpp-package/example/utils.h @@ -27,6 +27,15 @@ using namespace mxnet::cpp; +#define TRY \ + try { +#define CATCH \ + } catch(dmlc::Error &err) { \ + LG << "Status: FAIL";\ + LG << "With Error: " << MXGetLastError(); \ + return 1; \ + } + bool isFileExists(const std::string &filename) { std::ifstream fhandle(filename.c_str()); return fhandle.good(); diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index d5aa1509a8f0..67f984fce0ee 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -39,12 +39,21 @@ class KVStore { static void SetType(const std::string& type); static void RunServer(); static void Init(int key, const NDArray& val); + static void Init(const std::string& key, const NDArray& val); static void Init(const std::vector& keys, const std::vector& vals); + static void Init(const std::vector& keys, const std::vector& vals); static void Push(int key, const NDArray& val, int priority = 0); + static void Push(const std::string& key, const NDArray& val, int priority = 0); static void Push(const std::vector& keys, - const std::vector& vals, int priority = 0); + const std::vector& vals, int priority = 0); + static void Push(const std::vector& keys, + const std::vector& vals, int priority = 0); static void Pull(int key, NDArray* out, int priority = 0); - static void Pull(const std::vector& keys, std::vector* outs, int priority = 0); + static void Pull(const std::string& key, NDArray* out, int priority = 0); + static void Pull(const std::vector& keys, + std::vector* outs, int priority = 0); + static void Pull(const std::vector& keys, + std::vector* outs, int priority = 0); // TODO(lx): put lr in optimizer or not? 
static void SetOptimizer(std::unique_ptr optimizer, bool local = false); static std::string GetType(); diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp index f2b5e74990ce..6cd405b91dd4 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.hpp +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -87,6 +87,12 @@ inline void KVStore::Init(int key, const NDArray& val) { CHECK_EQ(MXKVStoreInit(get_kvstore()->get_handle(), 1, &key, &val_handle), 0); } +inline void KVStore::Init(const std::string& key, const NDArray& val) { + const char* key_handle = key.c_str(); + NDArrayHandle val_handle = val.GetHandle(); + CHECK_EQ(MXKVStoreInitEx(get_kvstore()->get_handle(), 1, &key_handle, &val_handle), 0); +} + inline void KVStore::Init(const std::vector& keys, const std::vector& vals) { CHECK_EQ(keys.size(), vals.size()); std::vector val_handles(vals.size()); @@ -99,14 +105,36 @@ inline void KVStore::Init(const std::vector& keys, const std::vector& keys, const std::vector& vals) { + CHECK_EQ(keys.size(), vals.size()); + std::vector key_handles(keys.size()); + std::transform(keys.cbegin(), keys.cend(), key_handles.begin(), + [](const std::string& key) { + return key.c_str(); + }); + std::vector val_handles(vals.size()); + std::transform(vals.cbegin(), vals.cend(), val_handles.begin(), + [](const NDArray& val) { + return val.GetHandle(); + }); + + CHECK_EQ(MXKVStoreInitEx(get_kvstore()->get_handle(), key_handles.size(), key_handles.data(), + val_handles.data()), 0); +} + inline void KVStore::Push(int key, const NDArray& val, int priority) { NDArrayHandle val_handle = val.GetHandle(); CHECK_EQ(MXKVStorePush(get_kvstore()->get_handle(), 1, &key, &val_handle, priority), 0); } +inline void KVStore::Push(const std::string& key, const NDArray& val, int priority) { + const char* key_handle = key.c_str(); + NDArrayHandle val_handle = val.GetHandle(); + CHECK_EQ(MXKVStorePushEx(get_kvstore()->get_handle(), 1, &key_handle, &val_handle, priority), 
0); +} + inline void KVStore::Push(const std::vector& keys, - const std::vector& vals, - int priority) { + const std::vector& vals, int priority) { CHECK_EQ(keys.size(), vals.size()); std::vector val_handles(vals.size()); std::transform(vals.cbegin(), vals.cend(), val_handles.begin(), @@ -118,12 +146,37 @@ inline void KVStore::Push(const std::vector& keys, val_handles.data(), priority), 0); } +inline void KVStore::Push(const std::vector& keys, + const std::vector& vals, int priority) { + CHECK_EQ(keys.size(), vals.size()); + std::vector key_handles(keys.size()); + std::transform(keys.cbegin(), keys.cend(), key_handles.begin(), + [](const std::string& key) { + return key.c_str(); + }); + std::vector val_handles(vals.size()); + std::transform(vals.cbegin(), vals.cend(), val_handles.begin(), + [](const NDArray& val) { + return val.GetHandle(); + }); + + CHECK_EQ(MXKVStorePushEx(get_kvstore()->get_handle(), key_handles.size(), key_handles.data(), + val_handles.data(), priority), 0); +} + inline void KVStore::Pull(int key, NDArray* out, int priority) { NDArrayHandle out_handle = out->GetHandle(); CHECK_EQ(MXKVStorePull(get_kvstore()->get_handle(), 1, &key, &out_handle, priority), 0); } -inline void KVStore::Pull(const std::vector& keys, std::vector* outs, int priority) { +inline void KVStore::Pull(const std::string& key, NDArray* out, int priority) { + const char* key_handle = key.c_str(); + NDArrayHandle out_handle = out->GetHandle(); + CHECK_EQ(MXKVStorePullEx(get_kvstore()->get_handle(), 1, &key_handle, &out_handle, priority), 0); +} + +inline void KVStore::Pull(const std::vector& keys, + std::vector* outs, int priority) { CHECK_EQ(keys.size(), outs->size()); std::vector out_handles(keys.size()); @@ -136,6 +189,25 @@ inline void KVStore::Pull(const std::vector& keys, std::vector* ou out_handles.data(), priority), 0); } +inline void KVStore::Pull(const std::vector& keys, + std::vector* outs, int priority) { + CHECK_EQ(keys.size(), outs->size()); + + std::vector 
key_handles(keys.size()); + std::transform(keys.cbegin(), keys.cend(), key_handles.begin(), + [](const std::string& key) { + return key.c_str(); + }); + std::vector out_handles(keys.size()); + std::transform(outs->cbegin(), outs->cend(), out_handles.begin(), + [](const NDArray& val) { + return val.GetHandle(); + }); + + CHECK_EQ(MXKVStorePullEx(get_kvstore()->get_handle(), key_handles.size(), key_handles.data(), + out_handles.data(), priority), 0); +} + inline void KVStore::Updater(int key, NDArrayHandle recv, NDArrayHandle local, void* handle_) { Optimizer *opt = static_cast(handle_); diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp index 966cf75c9122..d0438305a62e 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.hpp +++ b/cpp-package/include/mxnet-cpp/ndarray.hpp @@ -233,12 +233,12 @@ inline NDArray NDArray::Reshape(const Shape &new_shape) const { return NDArray(handle); } inline void NDArray::WaitToRead() const { - CHECK_EQ(MXNDArrayWaitToRead(blob_ptr_->handle_), 0); + CHECK_EQ(MXNDArrayWaitToRead(blob_ptr_->handle_), 0) << MXGetLastError(); } inline void NDArray::WaitToWrite() { - CHECK_EQ(MXNDArrayWaitToWrite(blob_ptr_->handle_), 0); + CHECK_EQ(MXNDArrayWaitToWrite(blob_ptr_->handle_), 0) << MXGetLastError(); } -inline void NDArray::WaitAll() { CHECK_EQ(MXNDArrayWaitAll(), 0); } +inline void NDArray::WaitAll() { CHECK_EQ(MXNDArrayWaitAll(), 0) << MXGetLastError(); } inline void NDArray::SampleGaussian(mx_float mu, mx_float sigma, NDArray *out) { Operator("_random_normal")(mu, sigma).Invoke(*out); } @@ -397,11 +397,11 @@ inline size_t NDArray::Size() const { } inline std::vector NDArray::GetShape() const { - const mx_uint *out_pdata; - mx_uint out_dim; - MXNDArrayGetShape(blob_ptr_->handle_, &out_dim, &out_pdata); + const int *out_pdata; + int out_dim; + MXNDArrayGetShapeEx(blob_ptr_->handle_, &out_dim, &out_pdata); std::vector ret; - for (mx_uint i = 0; i < out_dim; ++i) { + for (int i = 0; i < out_dim; ++i) 
{ ret.push_back(out_pdata[i]); } return ret; diff --git a/cpp-package/include/mxnet-cpp/operator.hpp b/cpp-package/include/mxnet-cpp/operator.hpp index edc396f1477c..8cdd78d2c0e9 100644 --- a/cpp-package/include/mxnet-cpp/operator.hpp +++ b/cpp-package/include/mxnet-cpp/operator.hpp @@ -134,9 +134,11 @@ inline void Operator::Invoke(std::vector &outputs) { outputs_receiver = output_handles.data(); } - MXImperativeInvoke(handle_, num_inputs, input_ndarrays_.data(), - &num_outputs, &outputs_receiver, - param_keys.size(), param_keys.data(), param_values.data()); + if (MXImperativeInvoke(handle_, num_inputs, input_ndarrays_.data(), + &num_outputs, &outputs_receiver, + param_keys.size(), param_keys.data(), + param_values.data())) + LOG(FATAL) << MXGetLastError(); if (outputs.size() > 0) return; diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp index aed963949060..2e3fb7a2d5de 100644 --- a/cpp-package/include/mxnet-cpp/symbol.hpp +++ b/cpp-package/include/mxnet-cpp/symbol.hpp @@ -188,7 +188,7 @@ inline void Symbol::InferShape( std::vector keys; std::vector arg_ind_ptr; - std::vector arg_shape_data; + std::vector arg_shape_data; for (const auto &arg : arg_shapes) { keys.push_back(arg.first.c_str()); @@ -200,40 +200,40 @@ inline void Symbol::InferShape( arg_ind_ptr.push_back(arg_shape_data.size()); mx_uint in_shape_size; - const mx_uint *in_shape_ndim; - const mx_uint **in_shape_data; + const int *in_shape_ndim; + const int **in_shape_data; mx_uint out_shape_size; - const mx_uint *out_shape_ndim; - const mx_uint **out_shape_data; + const int *out_shape_ndim; + const int **out_shape_data; mx_uint aux_shape_size; - const mx_uint *aux_shape_ndim; - const mx_uint **aux_shape_data; + const int *aux_shape_ndim; + const int **aux_shape_data; int complete; - CHECK_EQ(MXSymbolInferShape(GetHandle(), keys.size(), keys.data(), - arg_ind_ptr.data(), arg_shape_data.data(), - &in_shape_size, &in_shape_ndim, &in_shape_data, - &out_shape_size, 
&out_shape_ndim, &out_shape_data, - &aux_shape_size, &aux_shape_ndim, &aux_shape_data, - &complete), + CHECK_EQ(MXSymbolInferShapeEx(GetHandle(), keys.size(), keys.data(), + arg_ind_ptr.data(), arg_shape_data.data(), + &in_shape_size, &in_shape_ndim, &in_shape_data, + &out_shape_size, &out_shape_ndim, &out_shape_data, + &aux_shape_size, &aux_shape_ndim, &aux_shape_data, + &complete), 0); if (complete) { for (mx_uint i = 0; i < in_shape_size; ++i) { in_shape->push_back(std::vector()); - for (mx_uint j = 0; j < in_shape_ndim[i]; ++j) { + for (int j = 0; j < in_shape_ndim[i]; ++j) { (*in_shape)[i].push_back(in_shape_data[i][j]); } } for (mx_uint i = 0; i < aux_shape_size; ++i) { aux_shape->push_back(std::vector()); - for (mx_uint j = 0; j < aux_shape_ndim[i]; ++j) { + for (int j = 0; j < aux_shape_ndim[i]; ++j) { (*aux_shape)[i].push_back(aux_shape_data[i][j]); } } for (mx_uint i = 0; i < out_shape_size; ++i) { out_shape->push_back(std::vector()); - for (mx_uint j = 0; j < out_shape_ndim[i]; ++j) { + for (int j = 0; j < out_shape_ndim[i]; ++j) { (*out_shape)[i].push_back(out_shape_data[i][j]); } } diff --git a/cpp-package/tests/ci_test.sh b/cpp-package/tests/ci_test.sh index 18fabea7a7f9..2d1f8e4f68e6 100755 --- a/cpp-package/tests/ci_test.sh +++ b/cpp-package/tests/ci_test.sh @@ -48,8 +48,11 @@ cp ../../build/cpp-package/example/mlp_cpu . cp ../../build/cpp-package/example/mlp_gpu . ./mlp_gpu - cp ../../build/cpp-package/example/test_optimizer . - ./test_optimizer +cp ../../build/cpp-package/example/test_optimizer . +./test_optimizer + +cp ../../build/cpp-package/example/test_kvstore . +./test_kvstore cp ../../build/cpp-package/example/test_score . ./test_score 0.93 diff --git a/dev_menu.py b/dev_menu.py index 1fda4c7aabca..d439d8194f2a 100755 --- a/dev_menu.py +++ b/dev_menu.py @@ -123,7 +123,7 @@ def create_virtualenv_default(): ('[Docker] sanity_check. 
Check for linting and code formatting and licenses.', [ "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh sanity_check", - "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh nightly_test_rat_check", + "ci/build.py --platform ubuntu_rat /work/runtime_functions.sh nightly_test_rat_check", ]), ('[Docker] Python3 CPU unittests', [ @@ -132,12 +132,12 @@ def create_virtualenv_default(): ]), ('[Docker] Python3 GPU unittests', [ - "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu", + "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu", "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_gpu", ]), ('[Docker] Python3 GPU+MKLDNN unittests', [ - "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu_cmake_mkldnn", + "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu_cmake_mkldnn", "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_gpu", ]), ('[Docker] Python3 CPU Intel MKLDNN unittests', diff --git a/docker/docker-python/Dockerfile.mxnet.python.cpu b/docker/docker-python/Dockerfile.mxnet.python.cpu index eb437f3c2334..0858e99e2d7d 100644 --- a/docker/docker-python/Dockerfile.mxnet.python.cpu +++ b/docker/docker-python/Dockerfile.mxnet.python.cpu @@ -19,10 +19,11 @@ # Dockerfile to build MXNet for CPU FROM ubuntu:16.04 +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet +RUN pip install mxnet==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python.cpu.mkl b/docker/docker-python/Dockerfile.mxnet.python.cpu.mkl index 043932ff7c8e..dbb7d29f5037 100644 --- 
a/docker/docker-python/Dockerfile.mxnet.python.cpu.mkl +++ b/docker/docker-python/Dockerfile.mxnet.python.cpu.mkl @@ -19,10 +19,11 @@ # Dockerfile to build MXNet CPU with MKL FROM ubuntu:16.04 +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet-mkl +RUN pip install mxnet-mkl==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80 b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80 index 8c83ece434a3..cb22721f48b9 100644 --- a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80 +++ b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80 @@ -19,10 +19,11 @@ # Dockerfile to build MXNet for GPU FROM nvidia/cuda:8.0-cudnn5-devel +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet-cu80 +RUN pip install mxnet-cu80==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80.mkl b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80.mkl index a057c1d20cb1..eda96c90d181 100644 --- a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80.mkl +++ b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu80.mkl @@ -19,10 +19,11 @@ # Dockerfile to build MXNet for GPU with MKL FROM nvidia/cuda:8.0-cudnn5-devel +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet-cu80mkl +RUN pip install 
mxnet-cu80mkl==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90 b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90 index 1e3d9869ac63..cd36b8afdbd8 100644 --- a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90 +++ b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90 @@ -19,10 +19,11 @@ # Dockerfile to build MXNet for GPU FROM nvidia/cuda:9.0-cudnn7-devel +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet-cu90 +RUN pip install mxnet-cu90==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90.mkl b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90.mkl index d82abd7cf523..0b274c4e0d3f 100644 --- a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90.mkl +++ b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu90.mkl @@ -19,10 +19,11 @@ # Dockerfile to build MXNet for GPU with MKL FROM nvidia/cuda:9.0-cudnn7-devel +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet-cu90mkl +RUN pip install mxnet-cu90mkl==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92 b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92 index ba5c54a2a2aa..2c43187faf79 100644 --- a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92 +++ b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92 @@ -19,10 +19,11 @@ # Dockerfile to build MXNet for GPU FROM nvidia/cuda:9.2-cudnn7-devel +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN 
python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet-cu92 +RUN pip install mxnet-cu92==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92.mkl b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92.mkl index 96a943980b59..db204897ef60 100644 --- a/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92.mkl +++ b/docker/docker-python/Dockerfile.mxnet.python.gpu.cu92.mkl @@ -19,10 +19,11 @@ # Dockerfile to build MXNet for GPU with MKL FROM nvidia/cuda:9.2-cudnn7-devel +ARG version -RUN apt-get update -RUN apt-get install -y wget python gcc -RUN wget https://bootstrap.pypa.io/get-pip.py -RUN python get-pip.py +RUN apt-get update && \ + apt-get install -y wget python-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py -RUN pip install mxnet-cu92mkl +RUN pip install mxnet-cu92mkl==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.cpu b/docker/docker-python/Dockerfile.mxnet.python3.cpu new file mode 100644 index 000000000000..8ad9950a2f21 --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.cpu @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Dockerfile to build MXNet for CPU + +FROM ubuntu:16.04 +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.cpu.mkl b/docker/docker-python/Dockerfile.mxnet.python3.cpu.mkl new file mode 100644 index 000000000000..c6312891c6ea --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.cpu.mkl @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Dockerfile to build MXNet CPU with MKL + +FROM ubuntu:16.04 +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet-mkl==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu80 b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu80 new file mode 100644 index 000000000000..58af6bfeb273 --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu80 @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Dockerfile to build MXNet for GPU + +FROM nvidia/cuda:8.0-cudnn5-devel +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet-cu80==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu80.mkl b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu80.mkl new file mode 100644 index 000000000000..059f002c8560 --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu80.mkl @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Dockerfile to build MXNet for GPU with MKL + +FROM nvidia/cuda:8.0-cudnn5-devel +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet-cu80mkl==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90 b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90 new file mode 100644 index 000000000000..a860de918054 --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90 @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Dockerfile to build MXNet for GPU + +FROM nvidia/cuda:9.0-cudnn7-devel +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet-cu90==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90.mkl b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90.mkl new file mode 100644 index 000000000000..c0b6145b28e7 --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90.mkl @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Dockerfile to build MXNet for GPU with MKL + +FROM nvidia/cuda:9.0-cudnn7-devel +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet-cu90mkl==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu92 b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu92 new file mode 100644 index 000000000000..6d877961db9f --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu92 @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Dockerfile to build MXNet for GPU + +FROM nvidia/cuda:9.2-cudnn7-devel +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet-cu92==$version diff --git a/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu92.mkl b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu92.mkl new file mode 100644 index 000000000000..b73df97e5950 --- /dev/null +++ b/docker/docker-python/Dockerfile.mxnet.python3.gpu.cu92.mkl @@ -0,0 +1,29 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Dockerfile to build MXNet for GPU with MKL + +FROM nvidia/cuda:9.2-cudnn7-devel +ARG version + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet-cu92mkl==$version diff --git a/docker/docker-python/README.md b/docker/docker-python/README.md index 6d17c50439ae..767be6d1eb33 100644 --- a/docker/docker-python/README.md +++ b/docker/docker-python/README.md @@ -19,31 +19,40 @@ # Release Python Docker Images for MXNet -The `docker-python` directory can be used to release mxnet python docker images to dockerhub after any mxnet release. -It uses the appropriate pip binaries to build different docker images as - -* cpu -* cpu_mkl -* latest (same as cpu) -* gpu_cu90 -* gpu_cu90_mkl -* gpu (same as gpu_cu90) -* gpu_cu80 -* gpu_cu80_mkl -* gpu_cu92 -* gpu_cu92_mkl - - -** Note: If you want to use a different pip binary (specific mxnet or cuda version, etc), you can edit the last line of the cpu or gpu dockerfile as required. +The `docker-python` directory can be used to release mxnet python docker images to dockerhub after any mxnet release. +It uses the appropriate pip binaries to build different docker images. 
Both python2 (default) and python3 images are available as - +* {version}_cpu +* {version}_cpu_mkl +* {version}_gpu_cu90 +* {version}_gpu_cu90_mkl +* {version}_gpu_cu80 +* {version}_gpu_cu80_mkl +* {version}_gpu_cu92 +* {version}_gpu_cu92_mkl +* {version}_cpu_py3 +* {version}_cpu_mkl_py3 +* {version}_gpu_cu90_py3 +* {version}_gpu_cu90_mkl_py3 +* {version}_gpu_cu80_py3 +* {version}_gpu_cu80_mkl_py3 +* {version}_gpu_cu92_py3 +* {version}_gpu_cu92_mkl_py3 -Refer: https://pypi.org/project/mxnet/ +And the following tags will be available without the version string in the image name (for Benchmarking and other use cases): +* latest (same as {version}_cpu) +* gpu (same as {version}_gpu_cu90) +* latest_cpu_mkl_py2 (same as {version}_cpu_mkl) +* latest_cpu_mkl_py3 (same as {version}_cpu_mkl_py3) +* latest_gpu_mkl_py2 (same as {version}_gpu_cu90_mkl) +* latest_gpu_mkl_py3 (same as {version}_gpu_cu90_mkl_py3) -### Usage -`./build_python_dockerfile.sh ` +Refer: https://pypi.org/project/mxnet/ -For example: -`./build_python_dockerfile.sh 1.3.0 ~/build-docker/incubator-mxnet` +### Using the Build Script +`./build_python_dockerfile.sh ` -** Note: The build script picks up the latest pip binaries. This means it uses the latest released mxnet version. The version specified as a parameter to the script is only used to tag the built image correctly. 
+For example: +`./build_python_dockerfile.sh 1.3.0 1.3.0.post0 ~/build-docker/incubator-mxnet` ### Tests run * [test_conv.py](https://github.com/apache/incubator-mxnet/blob/master/tests/python/train/test_conv.py) @@ -58,3 +67,10 @@ Credentials can be provided in the following ways: * **Set Environment Variables:** Set the following environment variables which the script will pick up to login to dockerhub at runtime - * $MXNET_DOCKERHUB_PASSWORD * $MXNET_DOCKERHUB_USERNAME + + +### Using the Docker Images +* The MXNet Python Docker images can be found here: https://hub.docker.com/r/mxnet/python/ + +* Docker Pull Command: `docker pull mxnet/python:` +* Get started: `docker run -it mxnet/python: bash` diff --git a/docker/docker-python/build_python_dockerfile.sh b/docker/docker-python/build_python_dockerfile.sh index 24a44c28970c..d0f24c8d6a5e 100755 --- a/docker/docker-python/build_python_dockerfile.sh +++ b/docker/docker-python/build_python_dockerfile.sh @@ -17,20 +17,19 @@ # specific language governing permissions and limitations # under the License. -set -e - # Check Params programname=$0 function usage { - echo "usage: $programname [version] [path]" + echo "usage: $programname [version] [pip_tag] [path]" echo " [version] Mxnet Version to build" + echo " [pip_tag] Pip Tag to use" echo " [path] Path to MXNet repository (to run tests)" echo " " exit 1 } -if [ $# -le 1 ] || [ $# -ge 3 ] +if [ $# -le 2 ] || [ $# -ge 4 ] then usage exit 1 @@ -39,29 +38,37 @@ fi # Two params provided echo "Building Docker Images for Apache MXNet (Incubating) v$1" mxnet_version="${1}" -test_dir="${2}" +pip_tag="${2}" +test_dir="${3}" -docker_build_image(){ - echo "Building docker image mxnet/python:${1}" - docker build -t mxnet/python:${1} -f ${2} . -} +# Remove the logs directory if it already exists else it may fail due to old logs. 
+LOGDIR=~/temp/docker_logs +if [ -d "${LOGDIR}" ]; then + rm -rf ${LOGDIR} +fi + +# Create ~/temp if it does not exist +mkdir -p ~/temp +mkdir ${LOGDIR} -docker_tag_image(){ - docker tag mxnet/python:${1} mxnet/python:${2} -} +# Functions docker_test_image_cpu(){ - echo "Running tests on mxnet/python:${1}" - docker run -v ${test_dir}:/mxnet mxnet/python:${1} bash -c "python /mxnet/docker/docker-python/test_mxnet.py ${mxnet_version}" - docker run -v ${test_dir}:/mxnet mxnet/python:${1} bash -c "python /mxnet/tests/python/train/test_conv.py" - docker run -v ${test_dir}:/mxnet mxnet/python:${1} bash -c "python /mxnet/example/image-classification/train_mnist.py" + image_tag="${1}" + python_version="${2}" + echo "Running tests on mxnet/python:${image_tag}" + docker run -v ${test_dir}:/mxnet mxnet/python:${image_tag} bash -c "${python_version} /mxnet/docker/docker-python/test_mxnet.py ${mxnet_version}" + docker run -v ${test_dir}:/mxnet mxnet/python:${image_tag} bash -c "${python_version} /mxnet/tests/python/train/test_conv.py" + docker run -v ${test_dir}:/mxnet mxnet/python:${image_tag} bash -c "${python_version} /mxnet/example/image-classification/train_mnist.py" } docker_test_image_gpu(){ + image_tag="${1}" + python_version="${2}" echo "Running tests on mxnet/python:${1}" - nvidia-docker run -v ${test_dir}:/mxnet mxnet/python:${1} bash -c "python /mxnet/docker/docker-python/test_mxnet.py ${mxnet_version}" - nvidia-docker run -v ${test_dir}:/mxnet mxnet/python:${1} bash -c "python /mxnet/tests/python/train/test_conv.py --gpu" - nvidia-docker run -v ${test_dir}:/mxnet mxnet/python:${1} bash -c "python /mxnet/example/image-classification/train_mnist.py --gpus 2" + nvidia-docker run -v ${test_dir}:/mxnet mxnet/python:${image_tag} bash -c "${python_version} /mxnet/docker/docker-python/test_mxnet.py ${mxnet_version}" + nvidia-docker run -v ${test_dir}:/mxnet mxnet/python:${image_tag} bash -c "${python_version} /mxnet/tests/python/train/test_conv.py --gpu" + nvidia-docker 
run -v ${test_dir}:/mxnet mxnet/python:${image_tag} bash -c "${python_version} /mxnet/example/image-classification/train_mnist.py --gpus 0,1,2,3" } # if both $MXNET_DOCKERHUB_PASSWORD and $MXNET_DOCKERHUB_USERNAME environment variables are set, docker will automatically login @@ -79,59 +86,132 @@ docker_account_logout(){ } docker_push_image(){ - docker push mxnet/python:${1} + image_tag="${1}" + docker push mxnet/python:${image_tag} } +docker_generate_image_cpu(){ + image_tag="${1}" + dockerfile="${2}" + python_version="${3}" + echo "Building docker image mxnet/python:${image_tag}" + docker build --build-arg version=${pip_tag} -t mxnet/python:${image_tag} -f ${dockerfile} . + docker_test_image_cpu ${image_tag} ${python_version} +} -# Build and Test dockerfiles - CPU -docker_build_image "${mxnet_version}_cpu" "Dockerfile.mxnet.python.cpu" -docker_test_image_cpu "${mxnet_version}_cpu" - -docker_build_image "${mxnet_version}_cpu_mkl" "Dockerfile.mxnet.python.cpu.mkl" -docker_test_image_cpu "${mxnet_version}_cpu_mkl" - -docker_tag_image "${mxnet_version}_cpu" "latest" -docker_test_image_cpu "latest" - - -#Build and Test dockerfiles - GPU -docker_build_image "${mxnet_version}_gpu_cu90" "Dockerfile.mxnet.python.gpu.cu90" -docker_test_image_gpu "${mxnet_version}_gpu_cu90" - -docker_build_image "${mxnet_version}_gpu_cu90_mkl" "Dockerfile.mxnet.python.gpu.cu90.mkl" -docker_test_image_gpu "${mxnet_version}_gpu_cu90_mkl" +docker_tag_image_cpu(){ + original_tag="${1}" + image_tag="${2}" + python_version="${3}" + docker tag mxnet/python:${original_tag} mxnet/python:${image_tag} + docker_test_image_cpu ${image_tag} ${python_version} +} -docker_tag_image "${mxnet_version}_gpu_cu90" "gpu" -docker_test_image_gpu "gpu" +docker_generate_image_gpu(){ + image_tag="${1}" + dockerfile="${2}" + python_version="${3}" + echo "Building docker image mxnet/python:${1}" + docker build --build-arg version=${pip_tag} -t mxnet/python:${image_tag} -f ${dockerfile} . 
+ docker_test_image_gpu ${image_tag} ${python_version} +} -docker_build_image "${mxnet_version}_gpu_cu80" "Dockerfile.mxnet.python.gpu.cu80" -docker_test_image_gpu "${mxnet_version}_gpu_cu80" +docker_tag_image_gpu(){ + original_tag="${1}" + image_tag="${2}" + python_version="${3}" + docker tag mxnet/python:${original_tag} mxnet/python:${image_tag} + docker_test_image_gpu ${image_tag} ${python_version} +} -docker_build_image "${mxnet_version}_gpu_cu80_mkl" "Dockerfile.mxnet.python.gpu.cu80.mkl" -docker_test_image_gpu "${mxnet_version}_gpu_cu80_mkl" +check_errors(){ + egrep -i "not found|error|returned a non-zero code|fail" ${LOGDIR}/docker* + if [ $? -eq 0 ]; then + echo "ERROR: One of the build/test commands failed. Refer to the filename above to see which image tag caused it." + exit 1 + else + echo "Success: No errors found" + fi +} -docker_build_image "${mxnet_version}_gpu_cu92" "Dockerfile.mxnet.python.gpu.cu92" -docker_test_image_gpu "${mxnet_version}_gpu_cu92" +# Build and Test dockerfiles - CPU +docker_generate_image_cpu "${mxnet_version}_cpu" "Dockerfile.mxnet.python.cpu" "python" > ${LOGDIR}/docker_cpu.out 2>&1 & +docker_generate_image_cpu "${mxnet_version}_cpu_mkl" "Dockerfile.mxnet.python.cpu.mkl" "python" > ${LOGDIR}/docker_cpu_mkl.out 2>&1 & -docker_build_image "${mxnet_version}_gpu_cu92_mkl" "Dockerfile.mxnet.python.gpu.cu92.mkl" -docker_test_image_gpu "${mxnet_version}_gpu_cu92_mkl" +#Build and Test dockerfiles - GPU +docker_generate_image_gpu "${mxnet_version}_gpu_cu90" "Dockerfile.mxnet.python.gpu.cu90" "python" > ${LOGDIR}/docker_gpu_cu90.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu90_mkl" "Dockerfile.mxnet.python.gpu.cu90.mkl" "python" > ${LOGDIR}/docker_gpu_cu90_mkl.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu80" "Dockerfile.mxnet.python.gpu.cu80" "python" > ${LOGDIR}/docker_gpu_cu80.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu80_mkl" "Dockerfile.mxnet.python.gpu.cu80.mkl" "python" > 
${LOGDIR}/docker_gpu_cu80_mkl.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu92" "Dockerfile.mxnet.python.gpu.cu92" "python" > ${LOGDIR}/docker_gpu_cu92.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu92_mkl" "Dockerfile.mxnet.python.gpu.cu92.mkl" "python" > ${LOGDIR}/docker_gpu_cu92_mkl.out 2>&1 + +echo "Waiting for MXNet Python2 Docker Images to Build" +wait + +# Build and Test Python3 dockerfiles - CPU +docker_generate_image_cpu "${mxnet_version}_cpu_py3" "Dockerfile.mxnet.python3.cpu" "python3" > ${LOGDIR}/docker_cpu_py3.out 2>&1 & +docker_generate_image_cpu "${mxnet_version}_cpu_mkl_py3" "Dockerfile.mxnet.python3.cpu.mkl" "python3" > ${LOGDIR}/docker_cpu_mkl_py3.out 2>&1 & + +#Build and Test Python3 dockerfiles - GPU +docker_generate_image_gpu "${mxnet_version}_gpu_cu90_py3" "Dockerfile.mxnet.python3.gpu.cu90" "python3" > ${LOGDIR}/docker_gpu_cu90_py3.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu90_mkl_py3" "Dockerfile.mxnet.python3.gpu.cu90.mkl" "python3" > ${LOGDIR}/docker_gpu_cu90_mkl_py3.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu80_py3" "Dockerfile.mxnet.python3.gpu.cu80" "python3" > ${LOGDIR}/docker_gpu_cu80_py3.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu80_mkl_py3" "Dockerfile.mxnet.python3.gpu.cu80.mkl" "python3" > ${LOGDIR}/docker_gpu_cu80_mkl_py3.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu92_py3" "Dockerfile.mxnet.python3.gpu.cu92" "python3" > ${LOGDIR}/docker_gpu_cu92_py3.out 2>&1 & +docker_generate_image_gpu "${mxnet_version}_gpu_cu92_mkl_py3" "Dockerfile.mxnet.python3.gpu.cu92.mkl" "python3" > ${LOGDIR}/docker_gpu_cu92_mkl_py3.out 2>&1 + +echo "Waiting for MXNet Python3 Docker Images to Build" +wait + +echo "Re-Tag 6 images with version-free names (for Benchmarking) - only after previous builds complete. 
" +docker_tag_image_cpu "${mxnet_version}_cpu" "latest" "python" > ${LOGDIR}/docker_latest.out 2>&1 & +docker_tag_image_gpu "${mxnet_version}_gpu_cu90" "gpu" "python" > ${LOGDIR}/docker_gpu.out 2>&1 & +docker_tag_image_cpu "${mxnet_version}_cpu_mkl" "latest_cpu_mkl_py2" "python" > ${LOGDIR}/docker_latest_cpu_mkl_py2.out 2>&1 & +docker_tag_image_cpu "${mxnet_version}_cpu_mkl_py3" "latest_cpu_mkl_py3" "python3" > ${LOGDIR}/docker_latest_cpu_mkl_py3.out 2>&1 & +docker_tag_image_gpu "${mxnet_version}_gpu_cu90_mkl" "latest_gpu_mkl_py2" "python" > ${LOGDIR}/docker_latest_gpu_mkl_py2.out 2>&1 & +docker_tag_image_gpu "${mxnet_version}_gpu_cu90_mkl_py3" "latest_gpu_mkl_py3" "python3" > ${LOGDIR}/docker_latest_gpu_mkl_py3.out 2>&1 +wait + +# Parse all the docker logfiles to make sure there is no error. Fail script if error is found. +check_errors # Push dockerfiles echo "All images were successfully built. Now login to dockerhub and push images" docker_account_login +# Python2 docker_push_image "${mxnet_version}_cpu" docker_push_image "${mxnet_version}_cpu_mkl" -docker_push_image "latest" docker_push_image "${mxnet_version}_gpu_cu90" docker_push_image "${mxnet_version}_gpu_cu90_mkl" -docker_push_image "gpu" docker_push_image "${mxnet_version}_gpu_cu80" docker_push_image "${mxnet_version}_gpu_cu80_mkl" docker_push_image "${mxnet_version}_gpu_cu92" docker_push_image "${mxnet_version}_gpu_cu92_mkl" +# Python3 +docker_push_image "${mxnet_version}_cpu_py3" +docker_push_image "${mxnet_version}_cpu_mkl_py3" +docker_push_image "${mxnet_version}_gpu_cu90_py3" +docker_push_image "${mxnet_version}_gpu_cu90_mkl_py3" +docker_push_image "${mxnet_version}_gpu_cu80_py3" +docker_push_image "${mxnet_version}_gpu_cu80_mkl_py3" +docker_push_image "${mxnet_version}_gpu_cu92_py3" +docker_push_image "${mxnet_version}_gpu_cu92_mkl_py3" + +docker_push_image "latest" +docker_push_image "gpu" +docker_push_image "latest_cpu_mkl_py2" +docker_push_image "latest_cpu_mkl_py3" +docker_push_image 
"latest_gpu_mkl_py2" +docker_push_image "latest_gpu_mkl_py3" + + docker_account_logout echo "Successfully Built, Tested and Pushed all Images to Dockerhub. Link: https://hub.docker.com/r/mxnet/python/tags/" + +#Delete the log directory since everything succeeded: +rm -rf ${LOGDIR} \ No newline at end of file diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js index f4fde4e1f2ef..ec3977601c92 100644 --- a/docs/_static/js/options.js +++ b/docs/_static/js/options.js @@ -1,8 +1,9 @@ + /*! * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file + * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at @@ -18,7 +19,7 @@ */ /* Installation page display functions for install selector */ -var versionSelect = defaultVersion = 'v1.3.1'; +var versionSelect = defaultVersion = 'v1.4.0'; var platformSelect = 'Linux'; var languageSelect = 'Python'; var processorSelect = 'CPU'; diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html index 302b1732233d..34f675853924 100644 --- a/docs/_static/mxnet-theme/index.html +++ b/docs/_static/mxnet-theme/index.html @@ -23,9 +23,9 @@
-

MXNet 1.3.1 Released

-

This release includes bug fixes, performance improvements, and documentation updates.

- Learn More +

MXNet 1.4.0 Released

+

This release introduces the Java Inference API and Julia API, as well as Control Flow Operators, MKLDNN optimizations, and SVRG optimization.

+ Learn More

A 60-minute Gluon Crash Course

diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html index 50c3debba395..d2449fa76e1b 100644 --- a/docs/_static/mxnet-theme/navbar.html +++ b/docs/_static/mxnet-theme/navbar.html @@ -11,7 +11,7 @@

Gluon @@ -23,11 +23,11 @@

  • Python
  • C++
  • Clojure
  • +
  • Java
  • Julia
  • Perl
  • R
  • Scala
  • -
  • Java
  • @@ -77,11 +77,11 @@

  • Python
  • C++
  • Clojure
  • +
  • Java
  • Julia
  • Perl
  • R
  • Scala
  • -
  • Java
  • - +